diff --git a/.gitignore b/.gitignore index 9bc9200..378607e 100644 --- a/.gitignore +++ b/.gitignore @@ -28,16 +28,12 @@ lib64/ MANIFEST sdist/ var/ -venv.bak/ -venv/ +*venv.bak/ +*venv/ wheels/ - .nox/ -.tox/ .vscode/ .ruff_cache .cache/ -*_venv/ - **/*.csv **/*.xlsx \ No newline at end of file diff --git a/README.md b/README.md index 54a86f0..9379e9c 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ # getfactormodels ![Python 3.11](https://img.shields.io/badge/Python-3.7+-306998.svg?logo=python&logoColor=ffde57&style=flat-square) ![PyPI - Version](https://img.shields.io/pypi/v/getfactormodels?style=flat-square&label=PyPI) +![PyPI - Status](https://img.shields.io/pypi/status/getfactormodels?style=flat-square) Reliably retrieve data for various multi-factor asset pricing models. @@ -27,141 +28,145 @@ _Thanks to: Kenneth French, Robert Stambaugh, Lin Sun, Zhiguo He, AQR Capital Ma `getfactormodels` requires Python ``>=3.7`` -* Install with pip: +* The easiest way to install getfactormodels is via pip: + ```shell - pip install getfactormodels + $ pip install getfactormodels ``` ## Usage ->[!WARNING] ->Please be aware that `getfactormodels` was recently released (Dec 20, 2023) and is not stable while this message is displayed. +>[!IMPORTANT] +>![PyPI - Status](https://img.shields.io/pypi/status/getfactormodels?style=flat-square) > -#### Python +>``getfactormodels`` is new. It was released on December 20, 2023. Don't rely on it for anything. -After installing, import ``getfactormodels`` and call ``get_factors()`` with the ``model`` and ``frequency`` parameters. 
Optionally, specify a ``start_date`` and ``end_date`` -* For example, to retrieve the daily q-factor model data: +After installation, import and call the ``get_factors()`` function with the ``model`` and ``frequency`` params: - ```py - import getfactormodels - - getfactormodels.get_factors(model='q', frequency='d') - ``` - > _Trimmed output:_ - ```txt - > df - Mkt-RF R_ME R_IA R_ROE R_EG RF - date - 1967-01-03 0.000778 0.004944 0.001437 -0.007118 -0.008563 0.000187 - 1967-01-04 0.001667 -0.003487 -0.000631 -0.002044 -0.000295 0.000187 - 1967-01-05 0.012990 0.004412 -0.005688 0.000838 -0.003075 0.000187 - 1967-01-06 0.007230 0.006669 0.008897 0.003603 0.002669 0.000187 - 1967-01-09 0.008439 0.006315 0.000331 0.004949 0.002979 0.000187 - ... ... ... ... ... ... ... - 2022-12-23 0.005113 -0.001045 0.004000 0.010484 0.003852 0.000161 - 2022-12-27 -0.005076 -0.001407 0.010190 0.009206 0.003908 0.000161 - 2022-12-28 -0.012344 -0.004354 0.000133 -0.010457 -0.004953 0.000161 - 2022-12-29 0.018699 0.008568 -0.008801 -0.012686 -0.002162 0.000161 - 2022-12-30 -0.002169 0.001840 0.001011 -0.004151 -0.003282 0.000161 - - [14096 rows x 6 columns] - ``` - - * or, retreive the monthly liquidity factors of Pastor and Stambaugh for the 1990s: - - ```py - import getfactormodels as getfactormodels - - df = getfactormodels.get_factors(model='liquidity', frequency='m', start_date='1990-01-01', end_date='1999-12-31') - ``` - > If you don't have time to type `liquidity`, type `liq`, or `ps`--there's a handy regex. 
+* For example, retrieving the daily ${q}^{5}$ factor model: + + ```python + import getfactormodels + + data = getfactormodels.get_factors(model='q', frequency='d') + ``` - * or, saving the monthly 3-factor model of Fama & French to a file: + > _Trimmed output:_ + + ```txt + > print(data) + Mkt-RF R_ME R_IA R_ROE R_EG RF + date + 1967-01-03 0.000778 0.004944 0.001437 -0.007118 -0.008563 0.000187 + 1967-01-04 0.001667 -0.003487 -0.000631 -0.002044 -0.000295 0.000187 + 1967-01-05 0.012990 0.004412 -0.005688 0.000838 -0.003075 0.000187 + 1967-01-06 0.007230 0.006669 0.008897 0.003603 0.002669 0.000187 + 1967-01-09 0.008439 0.006315 0.000331 0.004949 0.002979 0.000187 + ... ... ... ... ... ... ... + 2022-12-23 0.005113 -0.001045 0.004000 0.010484 0.003852 0.000161 + 2022-12-27 -0.005076 -0.001407 0.010190 0.009206 0.003908 0.000161 + 2022-12-28 -0.012344 -0.004354 0.000133 -0.010457 -0.004953 0.000161 + 2022-12-29 0.018699 0.008568 -0.008801 -0.012686 -0.002162 0.000161 + 2022-12-30 -0.002169 0.001840 0.001011 -0.004151 -0.003282 0.000161 + + [14096 rows x 6 columns] + ``` - ```py - import getfactormodels as gfm +* Retrieving the daily data for the Fama-French 3-factor model, since `start_date`: - df = gfm.get_factors(model='ff3', frequency='m', output="ff3_data.csv") - ``` - >The output parameter accepts a filename, path or directory, and can be one of csv, md, txt, xlsx, pkl. 
+ ```python + import getfactormodels as gfm -* You can also import just the models that you need.: + df = gfm.get_factors(model='ff3', frequency='d', start_date='2006-01-01') + ``` - * For example, to import only the *ICR* and *q*-factor models: +* Retrieving data for Stambaugh and Yuan's monthly *Mispricing* factors, between `start_date` and `end_date`, and saving the data to a file: - ```py - from getfactormodels import icr_factors, q_factors + ```python + import getfactormodels as gfm + + df = gfm.get_factors(model='mispricing', start_date='1970-01-01', end_date='1999-12-31', output='mispricing_factors.csv') + ``` - # Passing a model function with no params defaults to monthly. - df = icr_factors() + >``output`` can be a filename, directory, or path. If no extension is specified, defaults to .csv (can be one of: .xlsx, .csv, .txt, .pkl, .md) - # The 'q' models, and the 3-factor model of Fama-French also have weekly data. - df = q_factors(frequency="W", start_date="1992-01-01) - ``` +You can import only the models that you need: - * If using ``ff_factors()``, then an additional ``model`` parameter should be specified: +* For example, to import only the *ICR* and *q-factor* models: - ```py - from getfactormodels import ff_factors + ```python + from getfactormodels import icr_factors, q_factors - # To get annual data for the 5-factor model: - data = ff_factors(model="5", frequency="Y", output=".xlsx") + # Passing a model function without params defaults to monthly data. + df = icr_factors() - # Daily 3-factor model data, since 1970 (not specifying an end date - # will return data up until today): - data = ff_factors(model="3", frequency="D", start_date="1970-01-01") - ``` - > Output allows just an extension to be specified. 
+ # The 'q' models, and the 3-factor model of Fama-French have weekly data available: + df = q_factors(frequency="W", start_date="1992-01-01", output='.xlsx') + ``` -* or import all the models: + >``output`` allows just a file extension (with the `.`, else it'll be passed as a filename). - ```py - from getfactormodels.models import models +* When using `ff_factors()`, specify an additional `model` parameter (**this might be changed**): - df = models.barillas_shanken_factors('m') + ```python + # To get annual data for the 5-factor model: + data = ff_factors(model="5", frequency="Y", output=".xlsx") + + # Daily 3-factor model data, since 1970 (not specifying an end date + # will return data up until today): + data = ff_factors(model="3", frequency="D", start_date="1970-01-01") ``` -* There's also the `FactorExtractor` class that the CLI uses (it doesn't really do a whole lot yet): +There's also a ``FactorExtractor`` class (which doesn't do much yet, it's mainly used by the CLI; lots to do): ```python - from getfactormodels import FactorExtractor + from getfactormodels import FactorExtractor - fe = FactorExtractor(model='carhart', frequency='m', start_date='1980-01-01', end_date='1980-05-01') - fe.get_factors() - fe.to_file('carhart_factors.md') - ``` + fe = FactorExtractor(model='carhart', start_date='1980-01-01', end_date='1980-05-01') + fe.get_factors() + fe.drop_rf() + fe.to_file('~/carhart_factors.md') + ``` - * _The resulting ``carhart_factors.md`` file will look like this:_ +* _The resulting ``carhart_factors.md`` file will look like this:_ - | date | Mkt-RF | SMB | HML | MOM | RF | - |:--------------------|---------:|--------:|--------:|--------:|-------:| - | 1980-01-31 00:00:00 | 0.0551 | 0.0162 | 0.0175 | 0.0755 | 0.008 | - | 1980-02-29 00:00:00 | -0.0122 | -0.0185 | 0.0061 | 0.0788 | 0.0089 | - | 1980-03-31 00:00:00 | -0.129 | -0.0664 | -0.0101 | -0.0955 | 0.0121 | - | 1980-04-30 00:00:00 | 0.0397 | 0.0105 | 0.0106 | -0.0043 | 0.0126 | + | date | Mkt-RF | 
SMB | HML | MOM | + |:--------------------|---------:|--------:|--------:|--------:| + | 1980-01-31 00:00:00 | 0.0551 | 0.0162 | 0.0175 | 0.0755 | + | 1980-02-29 00:00:00 | -0.0122 | -0.0185 | 0.0061 | 0.0788 | + | 1980-03-31 00:00:00 | -0.129 | -0.0664 | -0.0101 | -0.0955 | + | 1980-04-30 00:00:00 | 0.0397 | 0.0105 | 0.0106 | -0.0043 | + +>``.drop_rf()`` will return the DataFrame without the `RF` column. You can also drop the "Mkt-RF" column with ``.drop_mkt()`` +### CLI -#### Using the CLI -* You can also use getfactormodels from the command line. +``bash >=4.2`` - ```bash - $ getfactormodels -h +* You can also use getfactormodels from the command line. It's very barebones, here's the `-h`: - usage: getfactormodels [-h] -m MODEL [-f FREQ] [-s START] [-e END] [-o OUTPUT] [--no_rf] - ``` + ```shell + $ getfactormodels -h + + usage: getfactormodels [-h] -m MODEL [-f FREQ] [-s START] [-e END] [-o OUTPUT] [--no_rf] [--no_mkt] + ``` * An example of how to use the CLI to retrieve the Fama-French 3-factor model data: - ```bash - getfactormodels --model ff3 --frequency M --start-date 1960-01-01 --end-date 2020-12-31 --output "filename.csv" - ``` - > Accepted file extensions are .csv, .txt, .xlsx, and .md. If no extension is given, the output file will be .csv. The --output flag allows a filename, filepath or a directory. If only an extension is provided (including the . else it'll be passed as a filename), a name will be generated. - -* Here's another example that retrieves the annual Fama-French 5-factor data without the RF column: - ```sh - getfactormodels -m 5 -f Y -s 1960-01-01 -e 2020-12-31 --no_rf -o ~/some_dir/filename.xlsx + ```shell + $ getfactormodels --model ff3 --frequency M --start-date 1960-01-01 --end-date 2020-12-31 --output ".csv" ``` - > `--no_rf` will return the factor model without an RF column. 
+ +* Here's another example that retrieves the annual Fama-French 5-factor data without the RF column (using ``--no_rf``): + + ```shell + $ getfactormodels -m ff5 -f Y -s 1960-01-01 -e 2020-12-31 --no_rf -o ~/some_dir/filename.xlsx + ``` +* To return the factors without the risk-free rate `RF` or the excess market return `Mkt-RF` columns, pass ``--no_rf`` and/or ``--no_mkt``. + +## Data Availability + +>[TODO] ## References 1. E. F. Fama and K. R. French, ‘Common risk factors in the returns on stocks and bonds’, *Journal of Financial Economics*, vol. 33, no. 1, pp. 3–56, 1993. [PDF](https://people.duke.edu/~charvey/Teaching/BA453_2006/FF_Common_risk.pdf) @@ -200,3 +205,15 @@ After installing, import ``getfactormodels`` and call ``get_factors()`` with the [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat-square&labelColor=ef8336)](https://pycqa.github.io/isort/) [![Ruff](https://img.shields.io/badge/-ruff-%23261230?style=flat-square&logo=ruff&logoColor=d7ff64)](https://simpleicons.org/?q=ruff) +--- + +#### Known issues + +* The first `hml_devil_factors()` retrieval is slow, because the download from aqr.com is slow. It's the only model, so far, implementing a cache—daily data expires at the end of the day and is only re-downloaded when the requested `end_date` exceeds the file's last index date. Similar for monthly, expiring at EOM and re-downloaded when needed. + +#### Todo + +- [ ] Docs + - [ ] Examples +- [ ] Tests +- [ ] Error handling diff --git a/getfactormodels/__init__.py b/getfactormodels/__init__.py index 91ca3f1..a480868 100644 --- a/getfactormodels/__init__.py +++ b/getfactormodels/__init__.py @@ -10,8 +10,8 @@ # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. 
+# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -20,10 +20,10 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -__version__ = "0.0.3" +__version__ = "0.0.4" from .__main__ import FactorExtractor, get_factors -from .models import models # noqa: F401 +from .models import models # noqa: F401, RUF100 (silent flake8 in VScode) from .models.models import (barillas_shanken_factors, carhart_factors, dhs_factors, ff_factors, hml_devil_factors, icr_factors, liquidity_factors, mispricing_factors, diff --git a/getfactormodels/__main__.py b/getfactormodels/__main__.py index e54ceb4..6660592 100755 --- a/getfactormodels/__main__.py +++ b/getfactormodels/__main__.py @@ -1,12 +1,16 @@ +#!/usr/bin/env python3 # -*- coding: utf-8 -*- -import os +from pathlib import Path +from typing import Optional import pandas as pd from dateutil import parser # ruff: noqa: RUF100 -from getfactormodels.models.models import (barillas_shanken_factors, # noqa: F401, E501 - carhart_factors, dhs_factors, - ff_factors, hml_devil_factors, - icr_factors, liquidity_factors, +from getfactormodels.models.models import \ + barillas_shanken_factors # noqa: F401 +from getfactormodels.models.models import carhart_factors # noqa: F401, E501 +from getfactormodels.models.models import (dhs_factors, ff_factors, + hml_devil_factors, icr_factors, + liquidity_factors, mispricing_factors, q_classic_factors, q_factors) from getfactormodels.utils.cli import parse_args @@ -14,10 +18,10 @@ def get_factors(model: str = "3", - frequency: str = "M", - start_date=None, - end_date=None, - output=None) -> pd.DataFrame: + frequency: Optional[str] = "M", + start_date: 
Optional[str] = None, + end_date: Optional[str] = None, + output: Optional[str] = None) -> pd.DataFrame: """Get data for a specified factor model. Return a DataFrame containing the data for the specified model and @@ -58,6 +62,7 @@ def get_factors(model: str = "3", raise ValueError(f"Invalid model: {model}") df = function(frequency, start_date, end_date, output) + return df @@ -78,11 +83,11 @@ class FactorExtractor: """ def __init__(self, - model='3', - frequency='M', - start_date=None, - end_date=None, - output=None): + model: str = '3', + frequency: Optional[str] = 'M', + start_date: Optional[str] = None, + end_date: Optional[str] = None, + output: Optional[str] = None): self.model: str = model self.frequency: str = frequency self.start_date = self.validate_date_format(start_date) if start_date \ @@ -91,14 +96,19 @@ def __init__(self, else None self.output = output self._no_rf = False + self._no_mkt = False self.df = None - def no_rf(self): + def no_rf(self) -> None: """Sets the _no_rf flag to True.""" self._no_rf = True + def no_mkt(self) -> None: + """Sets the _no_mkt flag to True.""" + self._no_mkt = True + @staticmethod - def validate_date_format(date_string): + def validate_date_format(date_string: str) -> str: """ Validate the date format. @@ -108,7 +118,8 @@ def validate_date_format(date_string): try: return parser.parse(date_string).strftime("%Y-%m-%d") except ValueError as err: - raise ValueError("Incorrect date format, use YYYY-MM-DD.") from err + error_message = "Incorrect date format, use YYYY-MM-DD." 
+ raise ValueError(error_message) from err def get_factors(self) -> pd.DataFrame: """Fetch the factor data and store it in the class.""" @@ -116,15 +127,22 @@ def get_factors(self) -> pd.DataFrame: model=self.model, frequency=self.frequency, start_date=self.start_date, - end_date=self.end_date) + end_date=self.end_date, + output=self.output) if self._no_rf: - self.df = self.drop_rf(self.df) + self.df = self.drop_rf(self.df.copy()) # create a copy before drop + if self._no_mkt: + self.df = self.drop_mkt(self.df.copy()) return self.df - def drop_rf(self, df): + def drop_rf(self, df: pd.DataFrame = None) -> pd.DataFrame: """Drop the ``RF`` column from the DataFrame.""" + # get_factors if not already done + if df is None: + df = self.get_factors() + if "RF" in df.columns: df = df.drop(columns=["RF"]) else: @@ -132,7 +150,19 @@ def drop_rf(self, df): return df - def to_file(self, filename): + def drop_mkt(self, df: pd.DataFrame = None) -> pd.DataFrame: + """Drop the ``MKT`` column from the DataFrame.""" + if df is None: + df = self.get_factors() + + if "Mkt-RF" in df.columns: + df = df.drop(columns=["Mkt-RF"]) + else: + print("`drop_mkt` was called but no MKT column was found.") + + return df + + def to_file(self, filename: str): """ Save the factor data to a file. 
@@ -153,12 +183,15 @@ def main(): start_date=args.start, end_date=args.end) if args.no_rf: extractor.no_rf() + if args.no_mkt: + extractor.no_mkt() df = extractor.get_factors() if args.output: extractor.to_file(args.output) - print(f'File saved to "{os.path.abspath(args.output)}"') + print(f'File saved to "{Path(args.output).resolve()}"') + else: print(df) diff --git a/getfactormodels/models/__init__.py b/getfactormodels/models/__init__.py index 9be4f54..e12f70d 100644 --- a/getfactormodels/models/__init__.py +++ b/getfactormodels/models/__init__.py @@ -20,5 +20,4 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from . import ff_models # noqa: F401 - TODO: disable 401 in all __init__ -from . import models # noqa: F401 +from . import ff_models, models diff --git a/getfactormodels/models/ff_models.py b/getfactormodels/models/ff_models.py index 84a9228..9b82af7 100644 --- a/getfactormodels/models/ff_models.py +++ b/getfactormodels/models/ff_models.py @@ -20,19 +20,23 @@ models construction. """ +# ruff: noqa: PLR2004 +from __future__ import annotations +from typing import Optional import numpy as np -import pandas as pd # noqa: D100 +import pandas as pd from ..utils.utils import ( # noqa - todo: fix relative import from parent modules banned _process, get_zip_from_url) -def _ff_construct_url(model="3", frequency="M"): +def _ff_construct_url(model: str = "3", frequency: str = "M") -> str: """Construct and return the URL for the specified model and frequency.""" frequency = frequency.upper() - if frequency == "W" and model not in ["3", "4"]: # why 4? - raise ValueError("Weekly data is only available for the Fama \ - French 3 factor model at the moment.") + if frequency == "W" and model not in ["3", "4"]: + error_message = "Weekly data is only available for the Fama French \ + 3 factor model at the moment." 
+ raise ValueError(error_message) base_url = "https://mba.tuck.dartmouth.edu" ftp = "pages/faculty/ken.french/ftp" @@ -48,7 +52,8 @@ def _ff_construct_url(model="3", frequency="M"): return f"{base_url}/{ftp}/{file}" -def _ff_read_csv_from_zip(zip_file, model=None): +def _ff_read_csv_from_zip(zip_file, + model: Optional[str] = None) -> pd.DataFrame: """Read the FF Factors CSV into a dataframe.""" try: filename = zip_file.namelist()[0] @@ -72,7 +77,8 @@ def _ff_read_csv_from_zip(zip_file, model=None): return data -def _ff_process_data(data, model, frequency) -> pd.DataFrame: +def _ff_process_data(data: pd.DataFrame, + model, frequency) -> pd.DataFrame: """Process and return the data based on the provided model and frequency. """ frequency = frequency.lower() @@ -105,7 +111,7 @@ def _ff_process_data(data, model, frequency) -> pd.DataFrame: return data -def _ff_get_mom(frequency) -> pd.Series: +def _ff_get_mom(frequency: str = "M") -> pd.Series: """Fetch and return the momentum factor data as a pd.Series. * Note: only for returning the raw data for the 4 and 6 factor models. """ @@ -128,8 +134,8 @@ def _ff_get_mom(frequency) -> pd.Series: def _get_ff_factors(model: str = "3", frequency: str = "M", - start_date=None, - end_date=None) -> pd.DataFrame: + start_date: Optional[str] = None, + end_date: Optional[str] = None) -> pd.DataFrame: """Return the Fama French 3, 5, or 6, or Carhart 4 factor model data. * Note: This is the function that's called by get_ff_factors in main. @@ -138,13 +144,16 @@ def _get_ff_factors(model: str = "3", frequency = "M" if frequency.upper() not in ["D", "M", "Y", "W"]: - raise ValueError("Frequency must be one of: D, M, Y, or W.") + err_msg = "Invalid frequency passed to get_ff_factors: " + err_msg += f" Frequency '{frequency}' not in ff_model `{model}`." 
+ raise ValueError(err_msg) + elif model not in ["3", "5", "6", "4"]: - raise ValueError(f"Invalid model passed to private function \ - _get_ff_factors, must be one of: 3, 5, 6, or 4, \ - not {model}. If you see this error message please \ - submit an issue at:\ - https://github.com/x512/getfactormodels/issues/") + err_msg = "Invalid model passed to get_ff_factors, must be one of: " + err_msg += "3, 5, 6, or 4, not {model}." + err_msg += "If you see this error message please submit an issue at:" + err_msg += " https://github.com/x512/getfactormodels/issues/" + raise ValueError(err_msg) url = _ff_construct_url(model, frequency) zip = get_zip_from_url(url) diff --git a/getfactormodels/models/models.py b/getfactormodels/models/models.py index 341fece..d3c364d 100644 --- a/getfactormodels/models/models.py +++ b/getfactormodels/models/models.py @@ -25,19 +25,20 @@ - ``barillas_shanken_factors`` relies on ``hml_devil_factors``, so it's also slow. """ +from __future__ import annotations import datetime from io import BytesIO +from pathlib import Path from typing import Optional, Union -import cachetools +import diskcache as dc import numpy as np import pandas as pd import requests from getfactormodels.utils.utils import _process, get_file_from_url from .ff_models import _get_ff_factors -# TODO: "PEP 484 prohibits implicit `Optional`" see: RUFF013. -def ff_factors(model: str = "3", # TODO: fix: _get_ff_factors filepath param +def ff_factors(model: str = "3", frequency: str = "M", start_date: Optional[str] = None, end_date: Optional[str] = None, @@ -83,8 +84,9 @@ def liquidity_factors(frequency: str = "M", url += '-/media/research/famamiller/data/liq_data_1962_2022.txt' if frequency.lower() != 'm': + err_msg = "Frequency must be 'm'." print('Liquidity factors are only available for monthly frequency.') - raise ValueError("Frequency must be 'm'.") + raise ValueError(err_msg) # Get .csv here... 
data = get_file_from_url(url) @@ -123,9 +125,9 @@ def mispricing_factors(frequency: str = "M", output: Optional[str] = None) -> pd.DataFrame: """Retrieve the Stambaugh-Yuan mispricing factors. Daily and monthly.""" if frequency.lower() not in ["d", "m"]: - print("Mispricing factors are only available for daily and monthly \ - frequency.") - raise ValueError("Frequency must be 'd' or 'm'.") + error_msg = "Mispricing factors are only available for daily and\ + monthly frequency." + raise ValueError(error_msg) return None file = "M4d" if frequency == "d" else "M4" @@ -213,21 +215,21 @@ def dhs_factors(frequency: str = "M", frequency = frequency.lower() base_url = "https://docs.google.com/spreadsheets/d/" - if frequency.lower() == "m": - file = "1RxYLbCfk19m8fnniiJYfaj3yI55ZPaoi/export?format=xlsx" - elif frequency.lower() == "d": - file = "1KnCP-NVhf2Sni8bVFIVyMxW-vIljBOWE/export?format=xlsx" + if frequency == "m": + sheet = "1RxYLbCfk19m8fnniiJYfaj3yI55ZPaoi/export?format=xlsx" + elif frequency == "d": + sheet = "1KnCP-NVhf2Sni8bVFIVyMxW-vIljBOWE/export?format=xlsx" else: - print("Frequency must be either 'M' (monthly) or 'D' (daily).") - raise ValueError("Frequency must be 'M' or 'D'.") - # TODO: use the link to the Google Sheet instead of the actual sheet. + error_message = "Frequency must be 'm' or 'd' for the DHS factors'." + print(error_message) + raise ValueError(error_message) - url = base_url + file + url = base_url + sheet response = requests.get(url, verify=True, timeout=20) - file = BytesIO(response.content) + content = BytesIO(response.content) - data = pd.read_excel(file, index_col="Date", + data = pd.read_excel(content, index_col="Date", usecols=['Date', 'FIN', 'PEAD'], engine='openpyxl', header=0, parse_dates=False) data.index.name = "date" @@ -259,11 +261,12 @@ def icr_factors(frequency: str = "M", """Retrieve the He, Kelly, Manela (2017) ICR factors. * Daily since 1999-05-03; quarterly and monthly since 1970. 
""" - # TODO: Do we need Mkt-RF and RF [seen reffered to as 2-factor model]? + # TODO: Do we need Mkt-RF and RF [seen referred to as 2-factor model. Also liq doesnt have mkt-rf or rf]? # noqa frequency = frequency.lower() if frequency not in ["d", "m", "q"]: - raise ValueError("Frequency must be 'd', 'm' or 'q'.") + err_msg = "Frequency must be 'd', 'm' or 'q'." + raise ValueError(err_msg) base_url = "https://voices.uchicago.edu/zhiguohe" file = {"d": "daily", "m": "monthly", "q": "quarterly"}.get(frequency) @@ -323,19 +326,22 @@ def carhart_factors(frequency: str = "M", # =========================== EXPERIMENTAL ================================== # -# Create a cache with a TTL (time-to-live) of one day -cache = cachetools.TTLCache(maxsize=100, ttl=86400) +cache_dir = Path('~/.cache/getfactormodels/aqr/hml_devil').expanduser() +cache_dir.mkdir(parents=True, exist_ok=True) +cache = dc.Cache(cache_dir) -def _download_hml_devil(frequency): - base_url = 'https://www.aqr.com/-/media/AQR/Documents/Insights/' - file = 'daily' if frequency.lower() == 'd' else 'monthly' - url = f'{base_url}/Data-Sets/The-Devil-in-HMLs-Details-Factors-{file}.xlsx' - print('Downloading HML Devil factors from AQR... This can take a while. Please be patient or something.') # noqa: E501 +def _aqr_download_data(url: str) -> pd.DataFrame: + """Download the data from the given URL.""" + print('Downloading data... This can take a while. 
Please be patient.') response = requests.get(url, verify=True, timeout=180) xls = pd.ExcelFile(BytesIO(response.content)) + return xls + +def _aqr_process_data(xls: pd.ExcelFile) -> pd.DataFrame: + """Process the downloaded data.""" sheets = {0: 'HML Devil', 4: 'MKT', 5: 'SMB', 7: 'UMD', 8: 'RF'} dfs = [] @@ -348,45 +354,23 @@ def _download_hml_devil(frequency): for sheet_index, sheet_name in sheets.items(): df = df_dict[sheet_name] - - df = df[['USA']] if sheet_index != 8 else df.iloc[:, 0:1] - + df = df[['USA']] if sheet_index != 8 else df.iloc[:, 0:1] # noqa df.columns = [sheet_name] dfs.append(df) data = pd.concat(dfs, axis=1) - data.rename(columns={'MKT': 'Mkt-RF', - 'HML Devil': 'HML_DEVIL'}, inplace=True) - data = data.astype(float) - - return data - - -def _get_hml_devil(frequency='M', - start_date: Optional[str] = None, - end_date: Optional[str] = None, - output: Optional[str] = None, - series=False) -> Union[pd.Series, pd.DataFrame]: - - data = _download_hml_devil(frequency) - - data.index.name = 'date' - data.index = pd.to_datetime(data.index) - if frequency.lower() == 'd': - data = data.dropna() + data = data.dropna(subset=['RF', 'UMD']) - if series: - return _process(data, start_date, end_date, filepath=output).HML_DEVIL + data = data.astype(float) - return _process(data, start_date, end_date, filepath=output) + return data -def hml_devil_factors(frequency='M', - start_date: Optional[str] = None, +def hml_devil_factors(frequency: str = 'M', start_date: Optional[str] = None, end_date: Optional[str] = None, output: Optional[str] = None, - series=False) -> Union[pd.Series, pd.DataFrame]: + series: bool = False) -> Union[pd.Series, pd.DataFrame]: """***EXPERIMENTAL*** Retrieve the HML Devil factors from AQR.com. [FIXME: Slow.] @@ -407,23 +391,31 @@ def hml_devil_factors(frequency='M', pd.DataFrame: the HML Devil model data indexed by date. 
pd.Series: the HML factor as a pd.Series """ - # Use the current date as a cache key - current_date = datetime.date.today() - cache_key = (frequency, None, None, None, None, current_date) - - # If the result is in the cache, return it if not saving - if cache_key in cache: - result = cache[cache_key] - if end_date: - end_date = pd.to_datetime(end_date) - result = result.loc[result.index <= end_date] - - return _process(result, start_date, end_date, filepath=output) - - # Otherwise, compute the result and store it in the cache - data = _get_hml_devil(frequency, start_date, end_date, output, series) - cache[cache_key] = data - return _process(data, start_date, end_date, filepath=output) + base_url = 'https://www.aqr.com/-/media/AQR/Documents/Insights/' + file = 'daily' if frequency.lower() == 'd' else 'monthly' + url = f'{base_url}/Data-Sets/The-Devil-in-HMLs-Details-Factors-{file}.xlsx' + + current_date = datetime.date.today().strftime('%Y-%m-%d') + cache_key = ('hmld', frequency, None, None, None, None, current_date, + end_date) + + # Check if the data is in the cache + data, cached_end_date = cache.get(cache_key, default=(None, None)) + if data is not None and (end_date is None or end_date <= cached_end_date): + # Use it if it is and the end date is the same or earlier + return data + + xls = _aqr_download_data(url) + + # Process the downloaded data + data = _aqr_process_data(xls) + data.rename(columns={'MKT': 'Mkt-RF', 'HML Devil': 'HML_Devil'}, + inplace=True) + + # Store the processed data in the cache + cache[cache_key] = (data, end_date) # TTL is set here + + return data def barillas_shanken_factors(frequency: str = 'M', @@ -448,15 +440,14 @@ def barillas_shanken_factors(frequency: str = 'M', ff = ff_factors(model='6', frequency=frequency)[['Mkt-RF', 'SMB', 'UMD', 'RF']] - df = pd.merge(q, ff, left_index=True, right_index=True, how='inner') + df = q.merge(ff, left_index=True, right_index=True, how='inner') hml_devil = hml_devil_factors(frequency=frequency, 
start_date=start_date, - series=True) - - hml_devil = hml_devil.rename('HML_m') + series=True)[['HML Devil']] + hml_devil.index.name = 'date' - df = pd.merge(df, hml_devil, left_index=True, - right_index=True, how='inner') + hml_devil = hml_devil.rename(columns={'HML Devil': 'HML_m'}) + df = df.merge(hml_devil, left_index=True, right_index=True, how='inner') return _process(df, start_date, end_date, filepath=output) diff --git a/getfactormodels/utils/cli.py b/getfactormodels/utils/cli.py index c785b88..6846599 100644 --- a/getfactormodels/utils/cli.py +++ b/getfactormodels/utils/cli.py @@ -2,7 +2,7 @@ import argparse -def parse_args(): +def parse_args() -> argparse.Namespace: """Argument parser, allowing for command line arguments. This is the function used in pyproject.toml to run the CLI.""" parser = argparse.ArgumentParser( @@ -21,8 +21,10 @@ def parse_args(): help='The start date for the data.') parser.add_argument('-e', '--end', type=str, required=False, help='The end date for the data.') - parser.add_argument('-o', '--output', type=str, required=False, # noqa + parser.add_argument('-o', '--output', type=str, required=False, help='The file to save the data to.') parser.add_argument('--no_rf', '--no-rf', '--norf', action='store_true', help='Drop the RF column from the DataFrame.') + parser.add_argument('--no_mkt', '--no-mkt', '--nomkt', action='store_true', + help='Drop the Mkt-RF column from the DataFrame.') return parser.parse_args() diff --git a/getfactormodels/utils/utils.py b/getfactormodels/utils/utils.py index 9cb928c..d52396b 100644 --- a/getfactormodels/utils/utils.py +++ b/getfactormodels/utils/utils.py @@ -20,7 +20,7 @@ "liquidity": r"^(il)?liq(uidity)?|(pastor|ps|sp)$", "icr": r"\bicr|hkm\b", "dhs": r"^(\bdhs\b|behav.*)$", - "hml_devil": r"\bhml(_)?d(evil)?\b", + "hml_devil": r"\bhml(_)?d(evil)?|hmld\b", "barillas_shanken": r"\b(bs|bs6|barillas|shanken)\b", }) @@ -80,7 +80,8 @@ def _save_to_file(data, filename=None, output_dir=None): '.md': 
data.to_markdown, } if filename is None: - filename = datetime.now().strftime('%Y_%m_%d-%H%M') + '.csv' + filename = datetime.now().strftime('%Y_%m_%d-%H%M') \ + + '.csv' elif '.' not in filename: filename += '.csv' @@ -155,7 +156,10 @@ def _slice_dates(data, start_date=None, end_date=None): return data.loc[slice(start_date, end_date)] -def _process(data, start_date=None, end_date=None, filepath=None): +def _process(data: pd.DataFrame, + start_date: str = None, + end_date: str = None, + filepath: str = None) -> pd.DataFrame: """Process the data and optionally save it to a file. Note: the `filepath` takes a filename, path or directory. """ diff --git a/noxfile.py b/noxfile.py index 8c4db6e..ac13212 100644 --- a/noxfile.py +++ b/noxfile.py @@ -1,4 +1,5 @@ import nox + # TODO: mypy diff --git a/pyproject.toml b/pyproject.toml index 3799467..5c6ed10 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "getfactormodels" dynamic = ["version"] description = "Retrieve data for various multifactor asset pricing models." -authors = [{name = "S. Martin", email = "x512@pm.me"}] +authors = [{name = "S. 
Martin", email = "x512@pm.me"}, ] license = {file = "LICENSE"} readme = "README.md" keywords = ['finance', 'pricing models', 'financial analysis', 'econometrics', @@ -12,7 +12,12 @@ classifiers = [ "Topic :: Office/Business :: Financial :: Investment", "Topic :: Scientific/Engineering :: Mathematics", "Topic :: Scientific/Engineering :: Information Analysis", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3 :: Only", "Operating System :: OS Independent", "Environment :: Console", @@ -21,8 +26,8 @@ classifiers = [ "Development Status :: 2 - Pre-Alpha" ] requires-python = ">=3.7" # Will lower soon -dependencies = [ "numpy >=1.18.5", - "pandas >=1.4", +dependencies = [ "pandas >=1.4", + "numpy >=1.18.5", "requests >=2.20.0", "pyarrow >=14.0.1", "openpyxl >=3.0.3", @@ -30,8 +35,8 @@ dependencies = [ "numpy >=1.18.5", "cachetools==5.3.2" ] [project.optional-dependencies] -dev = ["flit>=3.2,<=3.9", "ruff==0.1.6", "pytest-cov", "pytest>=7.0", "isort", - "pytest-randomly", "nox==2023.4.22"] +dev = ["flit>=3.2,<=3.9", "ruff>=0.1.6", "pytest-cov", "pytest>=7.0", + "isort>=5.12", "pytest-randomly", "nox==2023.4.22"] [project.urls] "Homepage" = "https://github.com/x512/getfactormodels" @@ -51,7 +56,7 @@ lines_between_sections = false [tool.ruff] line-length = 79 -target-version = "py38" +target-version = "py312" indent-width = 4 respect-gitignore = true @@ -62,32 +67,35 @@ skip-magic-trailing-comma = false line-ending = "auto" # Some rules in preview, enable them: -# https://docs.astral.sh/ruff/settings/#format-preview -preview = true # enabled because of: E241 +## docs.astral.sh/ruff/settings/#format-preview +preview = true [tool.ruff.lint] exclude = [".git", ".git-rewrite",] # run `ruff linter` to see all available rules ## see: 
docs.astral.sh/ruff/rules/ select = ["E4", "E7", "E9", "F", "B", - "DTZ", "W2", "W5", "N", + "DTZ", "W2", "W5", "N", "PL", "NPY", "SIM", "TID", "PD", "E241", "S", "PTH", "RUF", - "FIX001", "FIX002", "TD004", - "TD005", "TD007", "E501", "E261"] + "FIX001", "TD004", "EM", + "TD005", "TD007", "E501", + "FA", "FIX002", "ANN", # fix + "E261"] # requires '--preview' # "TCH" (flake 8 typechecking) # "C901" (McCabe complexity) + # "CPY" Copyright notices # Undo soon: -ignore = ["PD901",] # generic variable name `df` for DataFrames +ignore = ["PD901",] # TODO: fix all generic variable name `df` for DataFrames # see: docs.astral.sh/ruff/configuration/ fixable = ["W29", "W5", "E241", "E261"] unfixable = ["B", "FIX001", "FIX002", "UP"] [tool.ruff.per-file-ignores] -"main.py" = ["UP007"] "**/__init__.py" = ["F401"] +"__main__.py" = ["F401"] # the function names are constructed with a key [tool.coverage.run] omit = ['tests/*']