Skip to content

check_experiment

is_valid_jsonl

is_valid_jsonl(
    file_path: str, media_folder: str, log_file: str | None = None
) -> bool

Check whether a file is a valid jsonl file: it must be readable line by line, and every line must be valid JSON containing the “prompt”, “api” and “model_name” keys.

Parameters:

Name Type Description Default
file_path str

Path to the jsonl file to be checked.

required
media_folder str

String containing the path to the media folder to be used.

required
log_file str | None

Path to the error log file, which is opened in append mode. The log will include the errors that caused the file to fail validation and the line numbers of the errors. If None, a log file is generated in the current directory, named after the input file with “.jsonl” replaced by “-error-log.txt”. Default is None.

None

Returns:

Type Description
bool

True if the file is a valid jsonl file, False otherwise.

Source code in src/prompto/scripts/check_experiment.py
def is_valid_jsonl(
    file_path: str, media_folder: str, log_file: str | None = None
) -> bool:
    """
    Check if a file is a valid jsonl experiment file.

    The file is read line by line. Each line must parse as a JSON object
    (dictionary) containing the "prompt", "api" and "model_name" keys.
    If "parameters" is present it must be a dictionary, if "multimedia"
    is present it is checked with check_multimedia, and if the "api" value
    is a key of ASYNC_APIS the API-specific prompt checks are run.
    After all lines have been read, the environment variables of every
    recognised API seen in the file are checked.

    Issues that are all instances of Warning are logged but do not make
    the file invalid.

    Parameters
    ----------
    file_path : str
        Path to the jsonl file to be checked.
    media_folder : str
        String containing the path to the media folder to be used.
    log_file : str | None
        Path to the error log file, which is opened in append mode.
        Log will include the errors that caused the file to fail validation
        and the (0-based) line indices of the errors.
        If None, a log file is generated in the current directory, named
        after file_path with ".jsonl" replaced by "-error-log.txt".
        Default is None.

    Returns
    -------
    bool
        True if the file is a valid jsonl file, False otherwise.
    """
    multimedia_path_errors = set()
    valid_indicator = True
    if log_file is None:
        log_file = os.path.basename(file_path).replace(".jsonl", "-error-log.txt")
        logging.info("Log file not provided. Generating one in current directory")

    logging.info(
        f"Checking {file_path}. Any errors will be saved to log file at {log_file}"
    )

    # log_file is never None at this point (a default was generated above),
    # so the log is always opened and a newline appended before the messages
    # of this run.
    with open(log_file, "a") as log:
        log.write("\n")
    write_log_message(log_file=log_file, log_message="Running checks...", log=True)

    model_environments_to_check = set()
    with open(file_path, "r") as f:
        # i is the 0-based index produced by enumerate; it is reported as-is
        for i, line in enumerate(f):
            issues = []
            try:
                # check if line is a valid json
                data = json.loads(line)

                if not isinstance(data, dict):
                    # valid JSON that is not an object (e.g. a number, list or
                    # null) cannot hold the required keys; record it as an
                    # issue instead of failing on the key lookups below
                    issues.append(
                        TypeError("line must be a JSON object (dictionary)")
                    )
                else:
                    # check that the required keys are present
                    for required_key in ("prompt", "api", "model_name"):
                        if required_key not in data:
                            issues.append(
                                KeyError(f'"{required_key}" key not found')
                            )

                    # if parameters is passed, check its a dictionary
                    if "parameters" in data and not isinstance(
                        data["parameters"], dict
                    ):
                        issues.append(
                            TypeError(
                                '"parameters" value must be a dictionary if provided'
                            )
                        )

                    # if multimedia is passed, run the multimedia checks
                    if "multimedia" in data:
                        multimedia_issues, path_errors = check_multimedia(
                            data["multimedia"], media_folder
                        )
                        issues.extend(multimedia_issues)
                        # update() mutates the accumulator in place; union()
                        # would return a new set and drop the path errors
                        multimedia_path_errors.update(path_errors)

                    if "api" in data:
                        if data["api"] not in ASYNC_APIS:
                            issues.append(
                                NotImplementedError(
                                    f"Model {data['api']} is not a valid model. "
                                    f"Please check the model name"
                                )
                            )
                        else:
                            # model specific checks
                            issues.extend(
                                ASYNC_APIS[data["api"]].check_prompt_dict(data)
                            )
                            # add model to set of models to check
                            # environment variables for
                            model_environments_to_check.add(data["api"])
            except json.JSONDecodeError as err:
                # line is not valid json; record the decode error
                issues.append(err)

            if issues:
                # only warnings leave the file valid; anything else fails it
                if not all(isinstance(item, Warning) for item in issues):
                    valid_indicator = False
                # log the issues
                log_msg = f"Line {i} has the following issues: {issues}"
                write_log_message(log_file=log_file, log_message=log_msg, log=True)

    # check environment variables for each model
    environment_issues = []
    for model in model_environments_to_check:
        environment_issues.extend(ASYNC_APIS[model].check_environment_variables())

    if environment_issues:
        if not all(isinstance(item, Warning) for item in environment_issues):
            valid_indicator = False
        log_msg = (
            f"File {file_path} has the following environment variables "
            f"that aren't set: {environment_issues}"
        )
        write_log_message(log_file=log_file, log_message=log_msg)

    if multimedia_path_errors:
        valid_indicator = False
        log_msg = (
            f"File {file_path} includes the following multimedia paths "
            f"that do not exist: {multimedia_path_errors}"
        )
        write_log_message(log_file=log_file, log_message=log_msg)

    if not valid_indicator:
        log_msg = f"File {file_path} is an invalid jsonl file"
        write_log_message(log_file=log_file, log_message=log_msg)
    else:
        logging.info(
            f"File {file_path} is a valid jsonl file. But check if there's any warnings in the logs"
        )

    return valid_indicator