def _check_jsonl_entry(data, media_folder: str) -> tuple[list, set, str | None]:
    """
    Run the per-line checks on one decoded JSONL entry.

    Parameters
    ----------
    data : Any
        Value returned by ``json.loads`` for a single line of the file.
    media_folder : str
        Path to the media folder, passed through to ``check_multimedia``.

    Returns
    -------
    tuple[list, set, str | None]
        The issues found for this entry, the multimedia path errors reported
        by ``check_multimedia``, and the entry's "api" value if it is a key
        of ``ASYNC_APIS`` (otherwise None).
    """
    issues: list = []
    path_errors: set = set()

    # json.loads can return a list, string, number, bool or None; the key
    # checks below only make sense for a JSON object, so report anything
    # else as an issue instead of failing on a membership test or indexing
    if not isinstance(data, dict):
        issues.append(
            TypeError(
                f"line must be a JSON object (dictionary), "
                f"but got {type(data).__name__}"
            )
        )
        return issues, path_errors, None

    # required keys, reported in a fixed order
    for key in ("prompt", "api", "model_name"):
        if key not in data:
            issues.append(KeyError(f'"{key}" key not found'))

    # if parameters is passed, check it is a dictionary
    if "parameters" in data and not isinstance(data["parameters"], dict):
        issues.append(
            TypeError('"parameters" value must be a dictionary if provided')
        )

    # if multimedia is passed, delegate its validation to check_multimedia
    if "multimedia" in data:
        multimedia_issues, missing_paths = check_multimedia(
            data["multimedia"], media_folder
        )
        issues.extend(multimedia_issues)
        path_errors.update(missing_paths)

    api_to_check = None
    if "api" in data:
        api = data["api"]
        if api not in ASYNC_APIS:
            issues.append(
                NotImplementedError(
                    f"Model {api} is not a valid model. "
                    "Please check the model name"
                )
            )
        else:
            # API specific checks
            issues.extend(ASYNC_APIS[api].check_prompt_dict(data))
            # only known APIs can have their environment variables checked
            api_to_check = api

    return issues, path_errors, api_to_check


def is_valid_jsonl(
    file_path: str, media_folder: str, log_file: str | None = None
) -> bool:
    """
    Check if a file is a valid jsonl file that can be read line by line
    and whose lines all pass the prompt checks.

    Every line must decode to a JSON object containing the "prompt", "api"
    and "model_name" keys. If "parameters" is present it must be a
    dictionary, if "multimedia" is present it is checked with
    ``check_multimedia``, and the "api" value must be a key of
    ``ASYNC_APIS``, whose ``check_prompt_dict`` is then applied to the line.
    After all lines are read, ``check_environment_variables`` is called for
    each API that appeared in the file.

    Issues that are instances of ``Warning`` are logged but do not make the
    file invalid. Any other issue, or any multimedia path error, does.

    Parameters
    ----------
    file_path : str
        Path to the jsonl file to be checked.
    media_folder : str
        String containing the path to the media folder to be used.
    log_file : str | None
        Path to the log file. It is opened in append mode on every call
        and receives the issues found together with the (zero-based)
        index of the line they were found on.
        If None, the name is derived from the base name of ``file_path``
        by replacing ".jsonl" with "-error-log.txt", which places it in
        the current working directory. Default is None.

    Returns
    -------
    bool
        True if the file is a valid jsonl file, False otherwise.
    """
    if log_file is None:
        log_file = os.path.basename(file_path).replace(".jsonl", "-error-log.txt")
        logging.info("Log file not provided. Generating one in current directory")

    logging.info(
        f"Checking {file_path}. Any errors will be saved to log file at {log_file}"
    )

    # separate this run from any previous content of the log file
    with open(log_file, "a") as log:
        log.write("\n")
    write_log_message(log_file=log_file, log_message="Running checks...", log=True)

    valid_indicator = True
    multimedia_path_errors = set()
    model_environments_to_check = set()

    with open(file_path, "r") as f:
        for i, line in enumerate(f):
            try:
                # check if line is a valid json
                data = json.loads(line)
            except json.JSONDecodeError as err:
                issues = [err]
            else:
                issues, path_errors, api = _check_jsonl_entry(data, media_folder)
                # update in place: set.union would return a new set and
                # leave the accumulator untouched
                multimedia_path_errors.update(path_errors)
                if api is not None:
                    model_environments_to_check.add(api)

            if issues:
                # warnings alone do not invalidate the file
                if any(not isinstance(item, Warning) for item in issues):
                    valid_indicator = False
                log_msg = f"Line {i} has the following issues: {issues}"
                write_log_message(log_file=log_file, log_message=log_msg, log=True)

    # check environment variables once per API used in the file
    environment_issues = []
    for model in model_environments_to_check:
        environment_issues.extend(ASYNC_APIS[model].check_environment_variables())

    if environment_issues:
        if any(not isinstance(item, Warning) for item in environment_issues):
            valid_indicator = False
        log_msg = (
            f"File {file_path} has the following environment variables "
            f"that aren't set: {environment_issues}"
        )
        write_log_message(log_file=log_file, log_message=log_msg)

    if multimedia_path_errors:
        valid_indicator = False
        log_msg = (
            f"File {file_path} includes the following multimedia paths "
            f"that do not exist: {multimedia_path_errors}"
        )
        write_log_message(log_file=log_file, log_message=log_msg)

    if not valid_indicator:
        log_msg = f"File {file_path} is an invalid jsonl file"
        write_log_message(log_file=log_file, log_message=log_msg)
    else:
        logging.info(
            f"File {file_path} is a valid jsonl file. "
            "But check if there's any warnings in the logs"
        )

    return valid_indicator