Source code for macrosynergy.download.extra
import os
import logging
import json
import pickle
from pathlib import Path
from typing import Any, Union
import pandas as pd
# Module-level logger named after this module; save_data and load_data log through it.
logger = logging.getLogger(__name__)
def save_data(path: Union[str, Path], data: Any) -> Path:
    """Save data locally, choosing the file format based on the data type.

    DataFrames are saved as Parquet, dicts and lists as JSON, and all other
    objects as pickle. The extension in ``path`` is replaced with the
    appropriate suffix regardless of what was originally supplied.

    Parameters
    ----------
    path : str or Path
        Destination path. The extension is overridden based on the data type.
    data : Any
        Object to persist. Its type determines the output format.

    Returns
    -------
    Path
        Path of the file that was written, with the correct extension applied:
        ``.parquet`` for DataFrames, ``.json`` for dicts and lists, and
        ``.pkl`` for everything else.

    Raises
    ------
    FileNotFoundError
        If the parent directory of ``path`` does not exist.
    TypeError
        If a dict or list contains objects that are not JSON serializable.
        Nothing is written to disk in that case.

    Notes
    -----
    JSON and pickle payloads are serialized in memory before the destination
    file is opened, so a serialization failure never leaves a truncated file
    behind (and never clobbers a previously saved one).

    JSON stores every mapping key as a string and every tuple as a list, so
    such values do not round-trip exactly.

    Files with the same stem but a different suffix are not removed. Because
    ``load_data`` probes ``.parquet``, ``.json`` and ``.pkl`` in that order
    when called without a matching file, an older file with a higher-priority
    suffix is found first.
    """
    path = Path(path)
    parent = path.parent
    if not parent.exists():
        logger.error("Directory does not exist: %s", parent)
        raise FileNotFoundError(f"Directory does not exist: {parent}")

    if isinstance(data, pd.DataFrame):
        out = path.with_suffix(".parquet")
        logger.info("Saving DataFrame as Parquet to %s", out)
        data.to_parquet(out)
    elif isinstance(data, (dict, list)):
        out = path.with_suffix(".json")
        logger.info("Saving %s as JSON to %s", type(data).__name__, out)
        # Serialize first: json.dump() streams chunks into the file, so an
        # unserializable value would leave a half-written .json on disk.
        payload = json.dumps(data)
        # Explicit encoding keeps the file independent of the platform locale.
        out.write_text(payload, encoding="utf-8")
    else:
        out = path.with_suffix(".pkl")
        logger.warning(
            "Saving %s as pickle to %s. Pickle files are not portable across "
            "Python versions or environments.",
            type(data).__name__,
            out,
        )
        # Same reasoning as for JSON: fail before the file is created or
        # truncated if the object cannot be pickled.
        out.write_bytes(pickle.dumps(data))

    logger.info("Data saved successfully to %s", out)
    return out
def load_data(path: Union[str, Path]) -> Any:
    """Load data previously saved by ``save_data``, inferring the format from the extension.

    If no file exists at ``path``, the function probes for ``.parquet``,
    ``.json`` and ``.pkl`` files with the same stem, in that order, and loads
    the first match found. A warning is logged when more than one of them
    exists, because the chosen file is not necessarily the most recent one.

    Parameters
    ----------
    path : str or Path
        Path to the saved file. The extension may be omitted if the file was
        written by ``save_data``.

    Returns
    -------
    Any
        The deserialized data: a DataFrame for ``.parquet`` files, a dict or
        list for ``.json`` files, and the unpickled object for any other file.

    Raises
    ------
    FileNotFoundError
        If neither ``path`` nor any of the probed candidates exists.

    Warnings
    --------
    Every file whose extension is not ``.parquet`` or ``.json`` is read with
    ``pickle``, which can execute arbitrary code while loading. Only load
    files that come from a trusted source.
    """
    path = Path(path)
    if not path.exists():
        logger.debug("No file at %s, probing for known extensions.", path)
        found = [
            candidate
            for candidate in (
                path.with_suffix(suffix) for suffix in (".parquet", ".json", ".pkl")
            )
            if candidate.exists()
        ]
        if not found:
            logger.error("No cached file found at %s", path)
            raise FileNotFoundError(f"No cached file found at {path}")
        if len(found) > 1:
            # Several formats share this stem (e.g. the same name was saved
            # with different data types); the probe order decides, not recency.
            logger.warning(
                "Multiple cached files match %s (%s); using %s.",
                path,
                ", ".join(str(match) for match in found),
                found[0],
            )
        logger.debug("Found %s, using it.", found[0])
        path = found[0]

    logger.info("Loading data from %s", path)
    # Compare case-insensitively so "X.JSON" / "X.PARQUET" are not mistaken
    # for pickle files.
    suffix = path.suffix.lower()
    if suffix == ".parquet":
        return pd.read_parquet(path)
    if suffix == ".json":
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(path, encoding="utf-8") as f:
            return json.load(f)

    # Any other extension is treated as pickle (kept for backward
    # compatibility). SECURITY: unpickling runs code embedded in the file, so
    # this must only ever be pointed at trusted files.
    logger.warning(
        "Loading pickle file %s. Pickle files are not portable across "
        "Python versions or environments.",
        path,
    )
    with open(path, "rb") as f:
        return pickle.load(f)