From 585a2d33d4d1482522be521331b140fa43381704 Mon Sep 17 00:00:00 2001 From: x512 Date: Sat, 23 Dec 2023 19:52:13 +1100 Subject: [PATCH 01/17] `drop_rf()` calls `get_factors()` if needed. ``README.md`` cleanup. --- .gitignore | 8 +- README.md | 207 +++++++++++++++++++----------------- getfactormodels/__main__.py | 5 + pyproject.toml | 7 +- 4 files changed, 120 insertions(+), 107 deletions(-) diff --git a/.gitignore b/.gitignore index 9bc9200..378607e 100644 --- a/.gitignore +++ b/.gitignore @@ -28,16 +28,12 @@ lib64/ MANIFEST sdist/ var/ -venv.bak/ -venv/ +*venv.bak/ +*venv/ wheels/ - .nox/ -.tox/ .vscode/ .ruff_cache .cache/ -*_venv/ - **/*.csv **/*.xlsx \ No newline at end of file diff --git a/README.md b/README.md index 54a86f0..69f6d58 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ # getfactormodels ![Python 3.11](https://img.shields.io/badge/Python-3.7+-306998.svg?logo=python&logoColor=ffde57&style=flat-square) ![PyPI - Version](https://img.shields.io/pypi/v/getfactormodels?style=flat-square&label=PyPI) +![PyPI - Status](https://img.shields.io/pypi/status/getfactormodels?style=flat-square) Reliably retrieve data for various multi-factor asset pricing models. @@ -14,7 +15,7 @@ Reliably retrieve data for various multi-factor asset pricing models. - Pastor and Stambaugh's liquidity factors [[5]](#5) - Mispricing factors of Stambaugh and Yuan[[6]](#6) - The $q$*-factor* model of Hou, Mo, Xue and Zhang[[7]](#7) -- The augmented $q^5$*-factor* model of Hou, Mo, Xue and Zhang[[8]](#8) +- The augmented $q^5$*-factor* model of Hou, Xue and Zhang[[8]](#8) - *Intermediary Capital Ratio* (ICR) of He, Kelly & Manela[[9]](#9) - The *DHS behavioural factors* of Daniel, Hirshleifer & Sun[[10]](#10) - The *HML* $^{DEVIL}$ factor of Asness & Frazzini[[11]](#11) @@ -27,141 +28,134 @@ _Thanks to: Kenneth French, Robert Stambaugh, Lin Sun, Zhiguo He, AQR Capital Ma `getfactormodels` requires Python ``>=3.7`` -* Install with pip: - ```shell - pip install getfactormodels +* The easiest way to install getfactormodels is via pip: + + ``` + $ pip install getfactormodels ``` ## Usage ->[!WARNING] ->Please be aware that `getfactormodels` was recently released (Dec 20, 2023) and is not stable while this message is displayed. -> -#### Python +>[!IMPORTANT] +>``getfactormodels`` is new. It was released on December 20, 2023. Don't rely on it for anything. -After installing, import ``getfactormodels`` and call ``get_factors()`` with the ``model`` and ``frequency`` parameters. Optionally, specify a ``start_date`` and ``end_date`` -* For example, to retrieve the daily q-factor model data: +After installation, import and call the ``get_factors()`` function with the ``model`` and ``frequency`` params: - ```py - import getfactormodels - - getfactormodels.get_factors(model='q', frequency='d') - ``` - > _Trimmed output:_ - ```txt - > df - Mkt-RF R_ME R_IA R_ROE R_EG RF - date - 1967-01-03 0.000778 0.004944 0.001437 -0.007118 -0.008563 0.000187 - 1967-01-04 0.001667 -0.003487 -0.000631 -0.002044 -0.000295 0.000187 - 1967-01-05 0.012990 0.004412 -0.005688 0.000838 -0.003075 0.000187 - 1967-01-06 0.007230 0.006669 0.008897 0.003603 0.002669 0.000187 - 1967-01-09 0.008439 0.006315 0.000331 0.004949 0.002979 0.000187 - ... ... ... ... ... ... ... - 2022-12-23 0.005113 -0.001045 0.004000 0.010484 0.003852 0.000161 - 2022-12-27 -0.005076 -0.001407 0.010190 0.009206 0.003908 0.000161 - 2022-12-28 -0.012344 -0.004354 0.000133 -0.010457 -0.004953 0.000161 - 2022-12-29 0.018699 0.008568 -0.008801 -0.012686 -0.002162 0.000161 - 2022-12-30 -0.002169 0.001840 0.001011 -0.004151 -0.003282 0.000161 - - [14096 rows x 6 columns] - ``` - - * or, retreive the monthly liquidity factors of Pastor and Stambaugh for the 1990s: - - ```py - import getfactormodels as getfactormodels - - df = getfactormodels.get_factors(model='liquidity', frequency='m', start_date='1990-01-01', end_date='1999-12-31') - ``` - > If you don't have time to type `liquidity`, type `liq`, or `ps`--there's a handy regex. +* For example, retrieving the monthly ${q}^{5}$ factor model: + + ```py + import getfactormodels + + data = getfactormodels.get_factors(model='q', frequency='m') + ``` - * or, saving the monthly 3-factor model of Fama & French to a file: + > _Trimmed output:_ + + ```txt + print(data) + Mkt-RF R_ME R_IA R_ROE R_EG RF + date + 1967-01-03 0.000778 0.004944 0.001437 -0.007118 -0.008563 0.000187 + 1967-01-04 0.001667 -0.003487 -0.000631 -0.002044 -0.000295 0.000187 + 1967-01-05 0.012990 0.004412 -0.005688 0.000838 -0.003075 0.000187 + 1967-01-06 0.007230 0.006669 0.008897 0.003603 0.002669 0.000187 + 1967-01-09 0.008439 0.006315 0.000331 0.004949 0.002979 0.000187 + ... ... ... ... ... ... ... + 2022-12-23 0.005113 -0.001045 0.004000 0.010484 0.003852 0.000161 + 2022-12-27 -0.005076 -0.001407 0.010190 0.009206 0.003908 0.000161 + 2022-12-28 -0.012344 -0.004354 0.000133 -0.010457 -0.004953 0.000161 + 2022-12-29 0.018699 0.008568 -0.008801 -0.012686 -0.002162 0.000161 + 2022-12-30 -0.002169 0.001840 0.001011 -0.004151 -0.003282 0.000161 + + [14096 rows x 6 columns] + ``` - ```py - import getfactormodels as gfm +* Retrieving the daily data for the Fama-French 3-factor model, since `start_date`: - df = gfm.get_factors(model='ff3', frequency='m', output="ff3_data.csv") - ``` - >The output parameter accepts a filename, path or directory, and can be one of csv, md, txt, xlsx, pkl. + ```py + import getfactormodels as gfm -* You can also import just the models that you need.: + df = gfm.get_factors(model='ff3', frequency='d', start_date=`2006-01-01`) + ``` - * For example, to import only the *ICR* and *q*-factor models: +* Retrieving data for Stambaugh and Yuan's monthly *Mispricing* factors, between `start_date` and `end_date`, and saving the data to a file: - ```py - from getfactormodels import icr_factors, q_factors + ```py + import getfactormodels as gfm + + df = gfm.get_factors(model='mispricing', start_date='1970-01-01', end_date=1999-12-31, output='mispricing_factors.csv') + ``` - # Passing a model function with no params defaults to monthly. - df = icr_factors() + >``output`` can be a filename, directory, or path. If no extension is specified, defaults to .csv (can be one of: .xlsx, .csv, .txt, .pkl, .md) - # The 'q' models, and the 3-factor model of Fama-French also have weekly data. - df = q_factors(frequency="W", start_date="1992-01-01) - ``` +You can import only the models that you need: - * If using ``ff_factors()``, then an additional ``model`` parameter should be specified: +* For example, to import only the *ICR* and *q-factor* models: - ```py - from getfactormodels import ff_factors + ```py + from getfactormodels import icr_factors, q_factors - # To get annual data for the 5-factor model: - data = ff_factors(model="5", frequency="Y", output=".xlsx") + # Passing a model function without params defaults to monthly data. + df = icr_factors() - # Daily 3-factor model data, since 1970 (not specifying an end date - # will return data up until today): - data = ff_factors(model="3", frequency="D", start_date="1970-01-01") - ``` - > Output allows just an extension to be specified. + # The 'q' models, and the 3-factor model of Fama-French have weekly data available: + df = q_factors(frequency="W", start_date="1992-01-01, output='.xlsx') + ``` -* or import all the models: + >``output`` allows just a file extension (with the `.`, else it'll be passed as a filename). - ```py - from getfactormodels.models import models +* When using `ff_factors()`, specify an additional `model` parameter (**this might be changed**): - df = models.barillas_shanken_factors('m') + ```py + # To get annual data for the 5-factor model: + data = ff_factors(model="5", frequency="Y", output=".xlsx") + + # Daily 3-factor model data, since 1970 (not specifying an end date + # will return data up until today): + data = ff_factors(model="3", frequency="D", start_date="1970-01-01") ``` -* There's also the `FactorExtractor` class that the CLI uses (it doesn't really do a whole lot yet): - ```python - from getfactormodels import FactorExtractor +There's also a ``FactorExtractor`` class (which doesn't do much yet, it's mainly used by the CLI; lots to do): + ``` + from getfactormodels import FactorExtractor - fe = FactorExtractor(model='carhart', frequency='m', start_date='1980-01-01', end_date='1980-05-01') - fe.get_factors() - fe.to_file('carhart_factors.md') - ``` + fe = FactorExtractor(model='carhart', start_date='1980-01-01', end_date='1980-05-01) + fe.get_factors() + fe.drop_rf() + fe.to_file('~/carhart_factors.md') + ``` - * _The resulting ``carhart_factors.md`` file will look like this:_ +* _The resulting ``carhart_factors.md`` file will look like this:_ - | date | Mkt-RF | SMB | HML | MOM | RF | - |:--------------------|---------:|--------:|--------:|--------:|-------:| - | 1980-01-31 00:00:00 | 0.0551 | 0.0162 | 0.0175 | 0.0755 | 0.008 | - | 1980-02-29 00:00:00 | -0.0122 | -0.0185 | 0.0061 | 0.0788 | 0.0089 | - | 1980-03-31 00:00:00 | -0.129 | -0.0664 | -0.0101 | -0.0955 | 0.0121 | - | 1980-04-30 00:00:00 | 0.0397 | 0.0105 | 0.0106 | -0.0043 | 0.0126 | + | date | Mkt-RF | SMB | HML | MOM | + |:--------------------|---------:|--------:|--------:|--------:| + | 1980-01-31 00:00:00 | 0.0551 | 0.0162 | 0.0175 | 0.0755 | + | 1980-02-29 00:00:00 | -0.0122 | -0.0185 | 0.0061 | 0.0788 | + | 1980-03-31 00:00:00 | -0.129 | -0.0664 | -0.0101 | -0.0955 | + | 1980-04-30 00:00:00 | 0.0397 | 0.0105 | 0.0106 | -0.0043 | -#### Using the CLI -* You can also use getfactormodels from the command line. +### CLI +``bash >=4.2`` +* You can also use getfactormodels from the command line. It's very barebones, here's the `-h` (there is no `--help` yet) - ```bash - $ getfactormodels -h + ```bash + $ getfactormodels -h - usage: getfactormodels [-h] -m MODEL [-f FREQ] [-s START] [-e END] [-o OUTPUT] [--no_rf] - ``` + usage: getfactormodels [-h] -m MODEL [-f FREQ] [-s START] [-e END] [-o OUTPUT] [--no_rf] + ``` * An example of how to use the CLI to retrieve the Fama-French 3-factor model data: - ```bash - getfactormodels --model ff3 --frequency M --start-date 1960-01-01 --end-date 2020-12-31 --output "filename.csv" - ``` - > Accepted file extensions are .csv, .txt, .xlsx, and .md. If no extension is given, the output file will be .csv. The --output flag allows a filename, filepath or a directory. If only an extension is provided (including the . else it'll be passed as a filename), a name will be generated. - -* Here's another example that retrieves the annual Fama-French 5-factor data without the RF column: + ```bash + $ getfactormodels --model ff3 --frequency M --start-date 1960-01-01 --end-date 2020-12-31 --output ".csv" + ``` + +* Here's another example that retrieves the annual Fama-French 5-factor data without the RF column (using ``--no_rf``) ```sh - getfactormodels -m 5 -f Y -s 1960-01-01 -e 2020-12-31 --no_rf -o ~/some_dir/filename.xlsx + $ getfactormodels -m ff5 -f Y -s 1960-01-01 -e 2020-12-31 --no_rf -o ~/some_dir/filename.xlsx ``` - > `--no_rf` will return the factor model without an RF column. ## References 1. E. F. Fama and K. R. French, ‘Common risk factors in the returns on stocks and bonds’, *Journal of Financial Economics*, vol. 33, no. 1, pp. 3–56, 1993. [PDF](https://people.duke.edu/~charvey/Teaching/BA453_2006/FF_Common_risk.pdf) @@ -200,3 +194,16 @@ After installing, import ``getfactormodels`` and call ``get_factors()`` with the [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat-square&labelColor=ef8336)](https://pycqa.github.io/isort/) [![Ruff](https://img.shields.io/badge/-ruff-%23261230?style=flat-square&logo=ruff&logoColor=d7ff64)](https://simpleicons.org/?q=ruff) +--- + +#### Known issues + +* The first `hml_devil_factors()` retrieval is slow, because the download from aqr.com is slow. It's the only model, so far, implementing a cache—daily data expires at the end of the day and is only re-downloaded when the requested`end_date` exceeds the file's last index date; similar for monthly, expiring at EOM and re-downloaded when needed. + + +#### Todo + +- [ ] Docs + - [ ] Examples +- [ ] Tests +- Error handling \ No newline at end of file diff --git a/getfactormodels/__main__.py b/getfactormodels/__main__.py index e54ceb4..c8cf40e 100755 --- a/getfactormodels/__main__.py +++ b/getfactormodels/__main__.py @@ -58,6 +58,7 @@ def get_factors(model: str = "3", raise ValueError(f"Invalid model: {model}") df = function(frequency, start_date, end_date, output) + return df @@ -125,6 +126,10 @@ def get_factors(self) -> pd.DataFrame: def drop_rf(self, df): """Drop the ``RF`` column from the DataFrame.""" + # get_factors if not already done + if df is None: + df = self.get_factors() + if "RF" in df.columns: df = df.drop(columns=["RF"]) else: diff --git a/pyproject.toml b/pyproject.toml index 3799467..c2043f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,12 @@ classifiers = [ "Topic :: Office/Business :: Financial :: Investment", "Topic :: Scientific/Engineering :: Mathematics", "Topic :: Scientific/Engineering :: Information Analysis", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3 :: Only", "Operating System :: OS Independent", "Environment :: Console", @@ -79,7 +84,7 @@ select = ["E4", "E7", "E9", "F", "B", # "C901" (McCabe complexity) # Undo soon: -ignore = ["PD901",] # generic variable name `df` for DataFrames +ignore = ["PD901",] # TODO: fix all generic variable name `df` for DataFrames # see: docs.astral.sh/ruff/configuration/ fixable = ["W29", "W5", "E241", "E261"] From c2857aaa79e4a79d96b39fa26c628854ed6a400c Mon Sep 17 00:00:00 2001 From: x512 Date: Sat, 23 Dec 2023 19:58:58 +1100 Subject: [PATCH 02/17] fix: syntax highlights --- README.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 69f6d58..c690a74 100644 --- a/README.md +++ b/README.md @@ -30,20 +30,20 @@ _Thanks to: Kenneth French, Robert Stambaugh, Lin Sun, Zhiguo He, AQR Capital Ma * The easiest way to install getfactormodels is via pip: - ``` + ```shell $ pip install getfactormodels ``` ## Usage >[!IMPORTANT] ->``getfactormodels`` is new. It was released on December 20, 2023. Don't rely on it for anything. +>![PyPI - Status](https://img.shields.io/pypi/status/getfactormodels?style=flat-square)``getfactormodels`` is new. It was released on December 20, 2023. Don't rely on it for anything.![PyPI - Status](https://img.shields.io/pypi/status/getfactormodels?style=flat-square) After installation, import and call the ``get_factors()`` function with the ``model`` and ``frequency`` params: * For example, retrieving the monthly ${q}^{5}$ factor model: - ```py + ```python import getfactormodels data = getfactormodels.get_factors(model='q', frequency='m') @@ -72,7 +72,7 @@ After installation, import and call the ``get_factors()`` function with the ``mo * Retrieving the daily data for the Fama-French 3-factor model, since `start_date`: - ```py + ```python import getfactormodels as gfm df = gfm.get_factors(model='ff3', frequency='d', start_date=`2006-01-01`) @@ -80,7 +80,7 @@ After installation, import and call the ``get_factors()`` function with the ``mo * Retrieving data for Stambaugh and Yuan's monthly *Mispricing* factors, between `start_date` and `end_date`, and saving the data to a file: - ```py + ```python import getfactormodels as gfm df = gfm.get_factors(model='mispricing', start_date='1970-01-01', end_date=1999-12-31, output='mispricing_factors.csv') @@ -92,7 +92,7 @@ You can import only the models that you need: * For example, to import only the *ICR* and *q-factor* models: - ```py + ```python from getfactormodels import icr_factors, q_factors # Passing a model function without params defaults to monthly data. @@ -106,7 +106,7 @@ You can import only the models that you need: * When using `ff_factors()`, specify an additional `model` parameter (**this might be changed**): - ```py + ```python # To get annual data for the 5-factor model: data = ff_factors(model="5", frequency="Y", output=".xlsx") @@ -117,7 +117,7 @@ You can import only the models that you need: There's also a ``FactorExtractor`` class (which doesn't do much yet, it's mainly used by the CLI; lots to do): - ``` + ```python from getfactormodels import FactorExtractor fe = FactorExtractor(model='carhart', start_date='1980-01-01', end_date='1980-05-01) @@ -140,20 +140,20 @@ There's also a ``FactorExtractor`` class (which doesn't do much yet, it's mainly ``bash >=4.2`` * You can also use getfactormodels from the command line. It's very barebones, here's the `-h` (there is no `--help` yet) - ```bash + ```shell $ getfactormodels -h usage: getfactormodels [-h] -m MODEL [-f FREQ] [-s START] [-e END] [-o OUTPUT] [--no_rf] ``` * An example of how to use the CLI to retrieve the Fama-French 3-factor model data: - ```bash + ```shell $ getfactormodels --model ff3 --frequency M --start-date 1960-01-01 --end-date 2020-12-31 --output ".csv" ``` * Here's another example that retrieves the annual Fama-French 5-factor data without the RF column (using ``--no_rf``) - ```sh + ```shell $ getfactormodels -m ff5 -f Y -s 1960-01-01 -e 2020-12-31 --no_rf -o ~/some_dir/filename.xlsx ``` From f8083d77945b875d67bc14c181d12b7925515f0d Mon Sep 17 00:00:00 2001 From: x512 Date: Sat, 23 Dec 2023 20:02:20 +1100 Subject: [PATCH 03/17] clean --- README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c690a74..1e6b2a2 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,8 @@ _Thanks to: Kenneth French, Robert Stambaugh, Lin Sun, Zhiguo He, AQR Capital Ma ## Usage >[!IMPORTANT] ->![PyPI - Status](https://img.shields.io/pypi/status/getfactormodels?style=flat-square)``getfactormodels`` is new. It was released on December 20, 2023. Don't rely on it for anything.![PyPI - Status](https://img.shields.io/pypi/status/getfactormodels?style=flat-square) +>![PyPI - Status](https://img.shields.io/pypi/status/getfactormodels?style=flat-square) +>``getfactormodels`` is new. It was released on December 20, 2023. Don't rely on it for anything. After installation, import and call the ``get_factors()`` function with the ``model`` and ``frequency`` params: @@ -115,8 +116,8 @@ You can import only the models that you need: data = ff_factors(model="3", frequency="D", start_date="1970-01-01") ``` - There's also a ``FactorExtractor`` class (which doesn't do much yet, it's mainly used by the CLI; lots to do): + ```python from getfactormodels import FactorExtractor @@ -137,7 +138,9 @@ There's also a ``FactorExtractor`` class (which doesn't do much yet, it's mainly ### CLI + ``bash >=4.2`` + * You can also use getfactormodels from the command line. It's very barebones, here's the `-h` (there is no `--help` yet) ```shell @@ -147,6 +150,7 @@ There's also a ``FactorExtractor`` class (which doesn't do much yet, it's mainly ``` * An example of how to use the CLI to retrieve the Fama-French 3-factor model data: + ```shell $ getfactormodels --model ff3 --frequency M --start-date 1960-01-01 --end-date 2020-12-31 --output ".csv" ``` From 25be8323f3dbc3cfd44ee1f20340ca9e7276ae84 Mon Sep 17 00:00:00 2001 From: x512 Date: Sat, 23 Dec 2023 22:15:26 +1100 Subject: [PATCH 04/17] few fixes --- README.md | 13 +++++++++---- getfactormodels/models/models.py | 13 +++++++++---- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 1e6b2a2..8cec408 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Reliably retrieve data for various multi-factor asset pricing models. - Pastor and Stambaugh's liquidity factors [[5]](#5) - Mispricing factors of Stambaugh and Yuan[[6]](#6) - The $q$*-factor* model of Hou, Mo, Xue and Zhang[[7]](#7) -- The augmented $q^5$*-factor* model of Hou, Xue and Zhang[[8]](#8) +- The augmented $q^5$*-factor* model of Hou, Mo, Xue and Zhang[[8]](#8) - *Intermediary Capital Ratio* (ICR) of He, Kelly & Manela[[9]](#9) - The *DHS behavioural factors* of Daniel, Hirshleifer & Sun[[10]](#10) - The *HML* $^{DEVIL}$ factor of Asness & Frazzini[[11]](#11) @@ -38,6 +38,7 @@ _Thanks to: Kenneth French, Robert Stambaugh, Lin Sun, Zhiguo He, AQR Capital Ma >[!IMPORTANT] >![PyPI - Status](https://img.shields.io/pypi/status/getfactormodels?style=flat-square) + >``getfactormodels`` is new. It was released on December 20, 2023. Don't rely on it for anything. After installation, import and call the ``get_factors()`` function with the ``model`` and ``frequency`` params: @@ -53,7 +54,7 @@ After installation, import and call the ``get_factors()`` function with the ``mo > _Trimmed output:_ ```txt - print(data) + > print(data) Mkt-RF R_ME R_IA R_ROE R_EG RF date 1967-01-03 0.000778 0.004944 0.001437 -0.007118 -0.008563 0.000187 @@ -161,6 +162,10 @@ There's also a ``FactorExtractor`` class (which doesn't do much yet, it's mainly $ getfactormodels -m ff5 -f Y -s 1960-01-01 -e 2020-12-31 --no_rf -o ~/some_dir/filename.xlsx ``` +## Data Availability + +>[TODO] + ## References 1. E. F. Fama and K. R. French, ‘Common risk factors in the returns on stocks and bonds’, *Journal of Financial Economics*, vol. 33, no. 1, pp. 3–56, 1993. [PDF](https://people.duke.edu/~charvey/Teaching/BA453_2006/FF_Common_risk.pdf) 2. M. Carhart, ‘On Persistence in Mutual Fund Performance’, *Journal of Finance*, vol. 52, no. 1, pp. 57–82, 1997. [PDF](https://onlinelibrary.wiley.com/doi/full/10.1111/j.1540-6261.1997.tb03808.x) @@ -202,7 +207,7 @@ There's also a ``FactorExtractor`` class (which doesn't do much yet, it's mainly #### Known issues -* The first `hml_devil_factors()` retrieval is slow, because the download from aqr.com is slow. It's the only model, so far, implementing a cache—daily data expires at the end of the day and is only re-downloaded when the requested`end_date` exceeds the file's last index date; similar for monthly, expiring at EOM and re-downloaded when needed. +* The first `hml_devil_factors()` retrieval is slow, because the download from aqr.com is slow. It's the only model, so far, implementing a cache—daily data expires at the end of the day and is only re-downloaded when the requested`end_date` exceeds the file's last index date. Similar for monthly, expiring at EOM and re-downloaded when needed. #### Todo @@ -210,4 +215,4 @@ There's also a ``FactorExtractor`` class (which doesn't do much yet, it's mainly - [ ] Docs - [ ] Examples - [ ] Tests -- Error handling \ No newline at end of file +- [ ] Error handling \ No newline at end of file diff --git a/getfactormodels/models/models.py b/getfactormodels/models/models.py index 341fece..87fe895 100644 --- a/getfactormodels/models/models.py +++ b/getfactormodels/models/models.py @@ -37,7 +37,7 @@ # TODO: "PEP 484 prohibits implicit `Optional`" see: RUFF013. -def ff_factors(model: str = "3", # TODO: fix: _get_ff_factors filepath param +def ff_factors(model: str = "3", frequency: str = "M", start_date: Optional[str] = None, end_date: Optional[str] = None, @@ -356,7 +356,8 @@ def _download_hml_devil(frequency): data = pd.concat(dfs, axis=1) data.rename(columns={'MKT': 'Mkt-RF', - 'HML Devil': 'HML_DEVIL'}, inplace=True) + 'HML Devil': 'HML_DEVIL'}) + data = data.astype(float) return data @@ -422,7 +423,11 @@ def hml_devil_factors(frequency='M', # Otherwise, compute the result and store it in the cache data = _get_hml_devil(frequency, start_date, end_date, output, series) + + # UMD returns NaNs for 1926 + data = data.dropna() cache[cache_key] = data + return _process(data, start_date, end_date, filepath=output) @@ -448,7 +453,7 @@ def barillas_shanken_factors(frequency: str = 'M', ff = ff_factors(model='6', frequency=frequency)[['Mkt-RF', 'SMB', 'UMD', 'RF']] - df = pd.merge(q, ff, left_index=True, right_index=True, how='inner') + df = q.merge(ff, left_index=True, right_index=True, how='inner') hml_devil = hml_devil_factors(frequency=frequency, start_date=start_date, series=True) @@ -456,7 +461,7 @@ def barillas_shanken_factors(frequency: str = 'M', hml_devil = hml_devil.rename('HML_m') hml_devil.index.name = 'date' - df = pd.merge(df, hml_devil, left_index=True, + df = df.merge(hml_devil, left_index=True, right_index=True, how='inner') return _process(df, start_date, end_date, filepath=output) From 29ccee7e07738ef162fc421f5bcbc67debb40383 Mon Sep 17 00:00:00 2001 From: x512 Date: Sat, 23 Dec 2023 22:53:22 +1100 Subject: [PATCH 05/17] added: ``.drop_mkt()``, ``--nomkt`` --- getfactormodels/__main__.py | 25 ++++++++++++++++++++++++- getfactormodels/models/ff_models.py | 2 +- getfactormodels/utils/cli.py | 4 +++- getfactormodels/utils/utils.py | 3 ++- 4 files changed, 30 insertions(+), 4 deletions(-) diff --git a/getfactormodels/__main__.py b/getfactormodels/__main__.py index c8cf40e..bccb2dc 100755 --- a/getfactormodels/__main__.py +++ b/getfactormodels/__main__.py @@ -2,6 +2,8 @@ import os import pandas as pd from dateutil import parser +from pathlib import Path + # ruff: noqa: RUF100 from getfactormodels.models.models import (barillas_shanken_factors, # noqa: F401, E501 carhart_factors, dhs_factors, @@ -98,6 +100,10 @@ def no_rf(self): """Sets the _no_rf flag to True.""" self._no_rf = True + def no_mkt(self): + """Sets the _no_mkt flag to True.""" + self._no_mkt = True + @staticmethod def validate_date_format(date_string): """ @@ -121,6 +127,8 @@ def get_factors(self) -> pd.DataFrame: if self._no_rf: self.df = self.drop_rf(self.df) + if self._no_mkt: + self.df = self.drop_mkt(self.df) return self.df @@ -137,6 +145,18 @@ def drop_rf(self, df): return df + def drop_mkt(self, df): + """Drop the ``MKT`` column from the DataFrame.""" + if df is None: + df = self.get_factors() + + if "Mkt-RF" in df.columns: + df = df.drop(columns=["Mkt-RF"]) + else: + print("`drop_mkt` was called but no MKT column was found.") + + return df + def to_file(self, filename): """ Save the factor data to a file. @@ -158,12 +178,15 @@ def main(): start_date=args.start, end_date=args.end) if args.no_rf: extractor.no_rf() + elif args.no_mkt: + extractor.no_mkt() df = extractor.get_factors() if args.output: extractor.to_file(args.output) - print(f'File saved to "{os.path.abspath(args.output)}"') + print(f'File saved to "{Path(args.output).resolve()}"') + else: print(df) diff --git a/getfactormodels/models/ff_models.py b/getfactormodels/models/ff_models.py index 84a9228..5ccdcd4 100644 --- a/getfactormodels/models/ff_models.py +++ b/getfactormodels/models/ff_models.py @@ -21,7 +21,7 @@ """ import numpy as np -import pandas as pd # noqa: D100 +import pandas as pd from ..utils.utils import ( # noqa - todo: fix relative import from parent modules banned _process, get_zip_from_url) diff --git a/getfactormodels/utils/cli.py b/getfactormodels/utils/cli.py index c785b88..bef1654 100644 --- a/getfactormodels/utils/cli.py +++ b/getfactormodels/utils/cli.py @@ -21,8 +21,10 @@ def parse_args(): help='The start date for the data.') parser.add_argument('-e', '--end', type=str, required=False, help='The end date for the data.') - parser.add_argument('-o', '--output', type=str, required=False, # noqa + parser.add_argument('-o', '--output', type=str, required=False, help='The file to save the data to.') parser.add_argument('--no_rf', '--no-rf', '--norf', action='store_true', help='Drop the RF column from the DataFrame.') + parser.add_argument('--no_mkt', '--no-mkt', '--nomkt', action='store_true', + help='Drop the MKT column from the DataFrame.') return parser.parse_args() diff --git a/getfactormodels/utils/utils.py b/getfactormodels/utils/utils.py index 9cb928c..60a78d7 100644 --- a/getfactormodels/utils/utils.py +++ b/getfactormodels/utils/utils.py @@ -80,7 +80,8 @@ def _save_to_file(data, filename=None, output_dir=None): '.md': data.to_markdown, } if filename is None: - filename = datetime.now().strftime('%Y_%m_%d-%H%M') + '.csv' + filename = datetime.now().strftime('%Y_%m_%d-%H%M') \ + + '.csv' elif '.' not in filename: filename += '.csv' From 73b7cebb9e54b7bf713c5d628ea15de826338ba1 Mon Sep 17 00:00:00 2001 From: x512 Date: Sat, 23 Dec 2023 23:08:14 +1100 Subject: [PATCH 06/17] fix: forgot ``_no_mkt`` in init --- getfactormodels/__main__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/getfactormodels/__main__.py b/getfactormodels/__main__.py index bccb2dc..1406969 100755 --- a/getfactormodels/__main__.py +++ b/getfactormodels/__main__.py @@ -94,6 +94,7 @@ def __init__(self, else None self.output = output self._no_rf = False + self._no_mkt = False self.df = None def no_rf(self): From f6b2251e834b6e8049571f0bd5ebd575f058e692 Mon Sep 17 00:00:00 2001 From: x512 Date: Sat, 23 Dec 2023 23:13:31 +1100 Subject: [PATCH 07/17] fix: typo --- getfactormodels/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/getfactormodels/__main__.py b/getfactormodels/__main__.py index 1406969..04ad040 100755 --- a/getfactormodels/__main__.py +++ b/getfactormodels/__main__.py @@ -179,7 +179,7 @@ def main(): start_date=args.start, end_date=args.end) if args.no_rf: extractor.no_rf() - elif args.no_mkt: + if args.no_mkt: extractor.no_mkt() df = extractor.get_factors() From 7a85b3317f4606a90713e0eb0c49ad0c20c30fbc Mon Sep 17 00:00:00 2001 From: x512 Date: Sat, 23 Dec 2023 23:16:08 +1100 Subject: [PATCH 08/17] clean: imports (isort) --- getfactormodels/__main__.py | 9 ++++----- getfactormodels/models/models.py | 1 + noxfile.py | 1 + 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/getfactormodels/__main__.py b/getfactormodels/__main__.py index 04ad040..349a693 100755 --- a/getfactormodels/__main__.py +++ b/getfactormodels/__main__.py @@ -1,12 +1,11 @@ # -*- coding: utf-8 -*- -import os +from pathlib import Path import pandas as pd from dateutil import parser -from pathlib import Path - # ruff: noqa: RUF100 -from getfactormodels.models.models import (barillas_shanken_factors, # noqa: F401, E501 - carhart_factors, dhs_factors, +from getfactormodels.models.models import \ + barillas_shanken_factors # noqa: F401, E501 +from getfactormodels.models.models import (carhart_factors, dhs_factors, ff_factors, hml_devil_factors, icr_factors, liquidity_factors, mispricing_factors, diff --git a/getfactormodels/models/models.py b/getfactormodels/models/models.py index 87fe895..effac33 100644 --- a/getfactormodels/models/models.py +++ b/getfactormodels/models/models.py @@ -34,6 +34,7 @@ import requests from getfactormodels.utils.utils import _process, get_file_from_url from .ff_models import _get_ff_factors + # TODO: "PEP 484 prohibits implicit `Optional`" see: RUFF013. diff --git a/noxfile.py b/noxfile.py index 8c4db6e..ac13212 100644 --- a/noxfile.py +++ b/noxfile.py @@ -1,4 +1,5 @@ import nox + # TODO: mypy From 88b995c9a7165131561af1507534b7d02a20d8fb Mon Sep 17 00:00:00 2001 From: x512 Date: Sun, 24 Dec 2023 03:33:06 +1100 Subject: [PATCH 09/17] added typehints, cleaned up. todo: ``hml_devil_factors()`` isn't using cache in cli! --- README.md | 13 +++--- getfactormodels/__init__.py | 6 +-- getfactormodels/__main__.py | 35 +++++++++-------- getfactormodels/models/ff_models.py | 39 +++++++++++------- getfactormodels/models/models.py | 61 ++++++++++++++--------------- getfactormodels/utils/cli.py | 4 +- getfactormodels/utils/utils.py | 5 ++- pyproject.toml | 25 ++++++------ 8 files changed, 103 insertions(+), 85 deletions(-) diff --git a/README.md b/README.md index 8cec408..8d76048 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,7 @@ After installation, import and call the ``get_factors()`` function with the ``mo 2022-12-29 0.018699 0.008568 -0.008801 -0.012686 -0.002162 0.000161 2022-12-30 -0.002169 0.001840 0.001011 -0.004151 -0.003282 0.000161 - [14096 rows x 6 columns] + [14096 rows x 6 columns] ``` * Retrieving the daily data for the Fama-French 3-factor model, since `start_date`: @@ -137,17 +137,18 @@ There's also a ``FactorExtractor`` class (which doesn't do much yet, it's mainly | 1980-03-31 00:00:00 | -0.129 | -0.0664 | -0.0101 | -0.0955 | | 1980-04-30 00:00:00 | 0.0397 | 0.0105 | 0.0106 | -0.0043 | +>``.drop_rf()`` will return the DataFrame without the `RF` column. You can also drop the "Mkt-RF" column with ``.drop_mkt()`` ### CLI ``bash >=4.2`` -* You can also use getfactormodels from the command line. It's very barebones, here's the `-h` (there is no `--help` yet) +* You can also use getfactormodels from the command line. It's very barebones, here's the `-h`: ```shell $ getfactormodels -h - usage: getfactormodels [-h] -m MODEL [-f FREQ] [-s START] [-e END] [-o OUTPUT] [--no_rf] + usage: getfactormodels [-h] -m MODEL [-f FREQ] [-s START] [-e END] [-o OUTPUT] [--no_rf] [--no_mkt] ``` * An example of how to use the CLI to retrieve the Fama-French 3-factor model data: @@ -161,7 +162,8 @@ There's also a ``FactorExtractor`` class (which doesn't do much yet, it's mainly ```shell $ getfactormodels -m ff5 -f Y -s 1960-01-01 -e 2020-12-31 --no_rf -o ~/some_dir/filename.xlsx ``` - +* To return the factors without the risk-free rate `RF`, or the excess market return `Mkt-RF`, columns: + ## Data Availability >[TODO] @@ -209,10 +211,9 @@ There's also a ``FactorExtractor`` class (which doesn't do much yet, it's mainly * The first `hml_devil_factors()` retrieval is slow, because the download from aqr.com is slow. It's the only model, so far, implementing a cache—daily data expires at the end of the day and is only re-downloaded when the requested`end_date` exceeds the file's last index date. Similar for monthly, expiring at EOM and re-downloaded when needed. - #### Todo - [ ] Docs - [ ] Examples - [ ] Tests -- [ ] Error handling \ No newline at end of file +- [ ] Error handling diff --git a/getfactormodels/__init__.py b/getfactormodels/__init__.py index 91ca3f1..f7d785e 100644 --- a/getfactormodels/__init__.py +++ b/getfactormodels/__init__.py @@ -10,8 +10,8 @@ # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, @@ -23,7 +23,7 @@ __version__ = "0.0.3" from .__main__ import FactorExtractor, get_factors -from .models import models # noqa: F401 +from .models import models # noqa: F401, RUF100 (silent flake8 in VScode) from .models.models import (barillas_shanken_factors, carhart_factors, dhs_factors, ff_factors, hml_devil_factors, icr_factors, liquidity_factors, mispricing_factors, diff --git a/getfactormodels/__main__.py b/getfactormodels/__main__.py index 349a693..2378e49 100755 --- a/getfactormodels/__main__.py +++ b/getfactormodels/__main__.py @@ -1,24 +1,26 @@ +#!/usr/bin/env python3 # -*- coding: utf-8 -*- from pathlib import Path import pandas as pd from dateutil import parser # ruff: noqa: RUF100 from getfactormodels.models.models import \ - barillas_shanken_factors # noqa: F401, E501 -from getfactormodels.models.models import (carhart_factors, dhs_factors, + barillas_shanken_factors # noqa: F401 +from getfactormodels.models.models import (carhart_factors, dhs_factors, # noqa: F401, E501 ff_factors, hml_devil_factors, icr_factors, liquidity_factors, mispricing_factors, q_classic_factors, q_factors) from getfactormodels.utils.cli import parse_args from getfactormodels.utils.utils import _get_model_key, _process +from typing import Optional def get_factors(model: str = "3", - frequency: str = "M", - start_date=None, - end_date=None, - output=None) -> pd.DataFrame: + frequency: Optional[str] = "M", + start_date: Optional[str] = None, + end_date: Optional[str] = None, + output: Optional[str] = None) -> pd.DataFrame: """Get data for a specified factor model. Return a DataFrame containing the data for the specified model and @@ -80,11 +82,11 @@ class FactorExtractor: """ def __init__(self, - model='3', - frequency='M', - start_date=None, - end_date=None, - output=None): + model: str = '3', + frequency: Optional[str] = 'M', + start_date: Optional[str] = None, + end_date: Optional[str] = None, + output: Optional[str] = None): self.model: str = model self.frequency: str = frequency self.start_date = self.validate_date_format(start_date) if start_date \ @@ -105,7 +107,7 @@ def no_mkt(self): self._no_mkt = True @staticmethod - def validate_date_format(date_string): + def validate_date_format(date_string: str) -> str: """ Validate the date format. @@ -123,7 +125,8 @@ def get_factors(self) -> pd.DataFrame: model=self.model, frequency=self.frequency, start_date=self.start_date, - end_date=self.end_date) + end_date=self.end_date, + output=self.output) if self._no_rf: self.df = self.drop_rf(self.df) @@ -132,7 +135,7 @@ def get_factors(self) -> pd.DataFrame: return self.df - def drop_rf(self, df): + def drop_rf(self, df: pd.DataFrame = None) -> pd.DataFrame: """Drop the ``RF`` column from the DataFrame.""" # get_factors if not already done if df is None: @@ -145,7 +148,7 @@ def drop_rf(self, df): return df - def drop_mkt(self, df): + def drop_mkt(self, df: pd.DataFrame = None) -> pd.DataFrame: """Drop the ``MKT`` column from the DataFrame.""" if df is None: df = self.get_factors() @@ -157,7 +160,7 @@ def drop_mkt(self, df): return df - def to_file(self, filename): + def to_file(self, filename: str): """ Save the factor data to a file. diff --git a/getfactormodels/models/ff_models.py b/getfactormodels/models/ff_models.py index 5ccdcd4..9b82af7 100644 --- a/getfactormodels/models/ff_models.py +++ b/getfactormodels/models/ff_models.py @@ -20,19 +20,23 @@ models construction. """ +# ruff: noqa: PLR2004 +from __future__ import annotations +from typing import Optional import numpy as np import pandas as pd from ..utils.utils import ( # noqa - todo: fix relative import from parent modules banned _process, get_zip_from_url) -def _ff_construct_url(model="3", frequency="M"): +def _ff_construct_url(model: str = "3", frequency: str = "M") -> str: """Construct and return the URL for the specified model and frequency.""" frequency = frequency.upper() - if frequency == "W" and model not in ["3", "4"]: # why 4? - raise ValueError("Weekly data is only available for the Fama \ - French 3 factor model at the moment.") + if frequency == "W" and model not in ["3", "4"]: + error_message = "Weekly data is only available for the Fama French \ + 3 factor model at the moment." + raise ValueError(error_message) base_url = "https://mba.tuck.dartmouth.edu" ftp = "pages/faculty/ken.french/ftp" @@ -48,7 +52,8 @@ def _ff_construct_url(model="3", frequency="M"): return f"{base_url}/{ftp}/{file}" -def _ff_read_csv_from_zip(zip_file, model=None): +def _ff_read_csv_from_zip(zip_file, + model: Optional[str] = None) -> pd.DataFrame: """Read the FF Factors CSV into a dataframe.""" try: filename = zip_file.namelist()[0] @@ -72,7 +77,8 @@ def _ff_read_csv_from_zip(zip_file, model=None): return data -def _ff_process_data(data, model, frequency) -> pd.DataFrame: +def _ff_process_data(data: pd.DataFrame, + model, frequency) -> pd.DataFrame: """Process and return the data based on the provided model and frequency. """ frequency = frequency.lower() @@ -105,7 +111,7 @@ def _ff_process_data(data, model, frequency) -> pd.DataFrame: return data -def _ff_get_mom(frequency) -> pd.Series: +def _ff_get_mom(frequency: str = "M") -> pd.Series: """Fetch and return the momentum factor data as a pd.Series. * Note: only for returning the raw data for the 4 and 6 factor models. """ @@ -128,8 +134,8 @@ def _ff_get_mom(frequency) -> pd.Series: def _get_ff_factors(model: str = "3", frequency: str = "M", - start_date=None, - end_date=None) -> pd.DataFrame: + start_date: Optional[str] = None, + end_date: Optional[str] = None) -> pd.DataFrame: """Return the Fama French 3, 5, or 6, or Carhart 4 factor model data. * Note: This is the function that's called by get_ff_factors in main. @@ -138,13 +144,16 @@ def _get_ff_factors(model: str = "3", frequency = "M" if frequency.upper() not in ["D", "M", "Y", "W"]: - raise ValueError("Frequency must be one of: D, M, Y, or W.") + err_msg = "Invalid frequency passed to get_ff_factors: " + err_msg += f" Frequency '{frequency}' not in ff_model `{model}`." + raise ValueError(err_msg) + elif model not in ["3", "5", "6", "4"]: - raise ValueError(f"Invalid model passed to private function \ - _get_ff_factors, must be one of: 3, 5, 6, or 4, \ - not {model}. If you see this error message please \ - submit an issue at:\ - https://github.com/x512/getfactormodels/issues/") + err_msg = "Invalid model passed to get_ff_factors, must be one of: " + err_msg += "3, 5, 6, or 4, not {model}." + err_msg += "If you see this error message please submit an issue at:" + err_msg += " https://github.com/x512/getfactormodels/issues/" + raise ValueError(err_msg) url = _ff_construct_url(model, frequency) zip = get_zip_from_url(url) diff --git a/getfactormodels/models/models.py b/getfactormodels/models/models.py index effac33..aba760c 100644 --- a/getfactormodels/models/models.py +++ b/getfactormodels/models/models.py @@ -25,6 +25,7 @@ - ``barillas_shanken_factors`` relies on ``hml_devil_factors``, so it's also slow. """ +from __future__ import annotations import datetime from io import BytesIO from typing import Optional, Union @@ -35,6 +36,7 @@ from getfactormodels.utils.utils import _process, get_file_from_url from .ff_models import _get_ff_factors + # TODO: "PEP 484 prohibits implicit `Optional`" see: RUFF013. @@ -226,9 +228,9 @@ def dhs_factors(frequency: str = "M", url = base_url + file response = requests.get(url, verify=True, timeout=20) - file = BytesIO(response.content) + content = BytesIO(response.content) - data = pd.read_excel(file, index_col="Date", + data = pd.read_excel(content, index_col="Date", usecols=['Date', 'FIN', 'PEAD'], engine='openpyxl', header=0, parse_dates=False) data.index.name = "date" @@ -328,7 +330,7 @@ def carhart_factors(frequency: str = "M", cache = cachetools.TTLCache(maxsize=100, ttl=86400) -def _download_hml_devil(frequency): +def _download_hml_devil(frequency: str = 'M') -> pd.DataFrame: base_url = 'https://www.aqr.com/-/media/AQR/Documents/Insights/' file = 'daily' if frequency.lower() == 'd' else 'monthly' url = f'{base_url}/Data-Sets/The-Devil-in-HMLs-Details-Factors-{file}.xlsx' @@ -364,31 +366,41 @@ def _download_hml_devil(frequency): return data -def _get_hml_devil(frequency='M', +# TODO: FIXME: HML Devil isn't using cache in cli. see /utils/cli.py probably. MKT -> Mkt-RF also!! Needs to be fixed before a swap-out hml for hm_devil func! # noqa: E501 + +def _get_hml_devil(frequency: str = 'M', start_date: Optional[str] = None, end_date: Optional[str] = None, output: Optional[str] = None, - series=False) -> Union[pd.Series, pd.DataFrame]: + series: bool = False) -> Union[pd.Series, pd.DataFrame]: - data = _download_hml_devil(frequency) + # Use the current date as a cache key + current_date = datetime.date.today() + cache_key = ('hmld', frequency, None, None, None, None, current_date) - data.index.name = 'date' - data.index = pd.to_datetime(data.index) + # Check if the data is in the cache + data = cache.get(cache_key) + if data is not None: + return data - if frequency.lower() == 'd': - data = data.dropna() + # If the data is not in the cache, download it + data = _download_hml_devil() - if series: - return _process(data, start_date, end_date, filepath=output).HML_DEVIL + # Apply transformations to the data + data = data.rename(columns={'MKT': 'Mkt-RF', 'HML Devil': 'HML_DEVIL'}) + data = data.astype(float) - return _process(data, start_date, end_date, filepath=output) + # Store the transformed data in the cache + cache[cache_key] = data + + return data -def hml_devil_factors(frequency='M', +def hml_devil_factors(frequency: str = 'M', start_date: Optional[str] = None, end_date: Optional[str] = None, output: Optional[str] = None, - series=False) -> Union[pd.Series, pd.DataFrame]: + series: bool = False) -> Union[pd.Series, pd.DataFrame]: """***EXPERIMENTAL*** Retrieve the HML Devil factors from AQR.com. [FIXME: Slow.] @@ -409,25 +421,12 @@ def hml_devil_factors(frequency='M', pd.DataFrame: the HML Devil model data indexed by date. pd.Series: the HML factor as a pd.Series """ - # Use the current date as a cache key - current_date = datetime.date.today() - cache_key = (frequency, None, None, None, None, current_date) + data = _get_hml_devil(frequency, start_date, end_date, series=series) - # If the result is in the cache, return it if not saving - if cache_key in cache: - result = cache[cache_key] - if end_date: - end_date = pd.to_datetime(end_date) - result = result.loc[result.index <= end_date] - - return _process(result, start_date, end_date, filepath=output) - - # Otherwise, compute the result and store it in the cache - data = _get_hml_devil(frequency, start_date, end_date, output, series) + data = data.dropna() - # UMD returns NaNs for 1926 + data = data.rename(columns={'MKT': 'Mkt-RF'}) data = data.dropna() - cache[cache_key] = data return _process(data, start_date, end_date, filepath=output) diff --git a/getfactormodels/utils/cli.py b/getfactormodels/utils/cli.py index bef1654..6846599 100644 --- a/getfactormodels/utils/cli.py +++ b/getfactormodels/utils/cli.py @@ -2,7 +2,7 @@ import argparse -def parse_args(): +def parse_args() -> argparse.Namespace: """Argument parser, allowing for command line arguments. This is the function used in pyproject.toml to run the CLI.""" parser = argparse.ArgumentParser( @@ -26,5 +26,5 @@ def parse_args(): parser.add_argument('--no_rf', '--no-rf', '--norf', action='store_true', help='Drop the RF column from the DataFrame.') parser.add_argument('--no_mkt', '--no-mkt', '--nomkt', action='store_true', - help='Drop the MKT column from the DataFrame.') + help='Drop the Mkt-RF column from the DataFrame.') return parser.parse_args() diff --git a/getfactormodels/utils/utils.py b/getfactormodels/utils/utils.py index 60a78d7..b1bb27a 100644 --- a/getfactormodels/utils/utils.py +++ b/getfactormodels/utils/utils.py @@ -156,7 +156,10 @@ def _slice_dates(data, start_date=None, end_date=None): return data.loc[slice(start_date, end_date)] -def _process(data, start_date=None, end_date=None, filepath=None): +def _process(data: pd.DataFrame, + start_date: str = None, + end_date: str = None, + filepath: str = None) -> pd.DataFrame: """Process the data and optionally save it to a file. Note: the `filepath` takes a filename, path or directory. """ diff --git a/pyproject.toml b/pyproject.toml index c2043f0..4c85fd5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,8 +26,8 @@ classifiers = [ "Development Status :: 2 - Pre-Alpha" ] requires-python = ">=3.7" # Will lower soon -dependencies = [ "numpy >=1.18.5", - "pandas >=1.4", +dependencies = [ "pandas >=1.4", + "numpy >=1.18.5", "requests >=2.20.0", "pyarrow >=14.0.1", "openpyxl >=3.0.3", @@ -35,8 +35,8 @@ dependencies = [ "numpy >=1.18.5", "cachetools==5.3.2" ] [project.optional-dependencies] -dev = ["flit>=3.2,<=3.9", "ruff==0.1.6", "pytest-cov", "pytest>=7.0", "isort", - "pytest-randomly", "nox==2023.4.22"] +dev = ["flit>=3.2,<=3.9", "ruff>=0.1.6", "pytest-cov", "pytest>=7.0", + "isort>=5.12", "pytest-randomly", "nox==2023.4.22"] [project.urls] "Homepage" = "https://github.com/x512/getfactormodels" @@ -56,7 +56,7 @@ lines_between_sections = false [tool.ruff] line-length = 79 -target-version = "py38" +target-version = "py312" indent-width = 4 respect-gitignore = true @@ -67,21 +67,24 @@ skip-magic-trailing-comma = false line-ending = "auto" # Some rules in preview, enable them: -# https://docs.astral.sh/ruff/settings/#format-preview -preview = true # enabled because of: E241 +## docs.astral.sh/ruff/settings/#format-preview +preview = true [tool.ruff.lint] exclude = [".git", ".git-rewrite",] # run `ruff linter` to see all available rules ## see: docs.astral.sh/ruff/rules/ select = ["E4", "E7", "E9", "F", "B", - "DTZ", "W2", "W5", "N", + "DTZ", "W2", "W5", "N", "PL", "NPY", "SIM", "TID", "PD", "E241", "S", "PTH", "RUF", - "FIX001", "FIX002", "TD004", - "TD005", "TD007", "E501", "E261"] + "FIX001", "TD004", "EM", + "TD005", "TD007", "E501", + "FA", "FIX002", "ANN", # fix + "E261"] # requires '--preview' # "TCH" (flake 8 typechecking) # "C901" (McCabe complexity) + # "CPY" Copyright notices # Undo soon: ignore = ["PD901",] # TODO: fix all generic variable name `df` for DataFrames @@ -91,8 +94,8 @@ fixable = ["W29", "W5", "E241", "E261"] unfixable = ["B", "FIX001", "FIX002", "UP"] [tool.ruff.per-file-ignores] -"main.py" = ["UP007"] "**/__init__.py" = ["F401"] +"__main__.py" = ["F401"] # the function names are constructed with a key [tool.coverage.run] omit = ['tests/*'] From 1a7a0618178e4200c2701a89185af22448d908e3 Mon Sep 17 00:00:00 2001 From: x512 Date: Sun, 24 Dec 2023 05:08:29 +1100 Subject: [PATCH 10/17] fix: ``hml_devil`` using a persistent cache --- getfactormodels/__main__.py | 4 +- getfactormodels/models/models.py | 90 ++++++++++++++------------------ getfactormodels/utils/utils.py | 2 +- 3 files changed, 42 insertions(+), 54 deletions(-) diff --git a/getfactormodels/__main__.py b/getfactormodels/__main__.py index 2378e49..bd0c58e 100755 --- a/getfactormodels/__main__.py +++ b/getfactormodels/__main__.py @@ -129,9 +129,9 @@ def get_factors(self) -> pd.DataFrame: output=self.output) if self._no_rf: - self.df = self.drop_rf(self.df) + self.df = self.drop_rf(self.df.copy()) # create a copy before drop -- use cache. if self._no_mkt: - self.df = self.drop_mkt(self.df) + self.df = self.drop_mkt(self.df.copy()) return self.df diff --git a/getfactormodels/models/models.py b/getfactormodels/models/models.py index aba760c..4eb50b7 100644 --- a/getfactormodels/models/models.py +++ b/getfactormodels/models/models.py @@ -29,7 +29,8 @@ import datetime from io import BytesIO from typing import Optional, Union -import cachetools +import diskcache as dc +import os import numpy as np import pandas as pd import requests @@ -326,19 +327,20 @@ def carhart_factors(frequency: str = "M", # =========================== EXPERIMENTAL ================================== # -# Create a cache with a TTL (time-to-live) of one day -cache = cachetools.TTLCache(maxsize=100, ttl=86400) +cache_dir = os.path.expanduser('~/.cache/getfactormodels/aqr/hml_devil') +os.makedirs(cache_dir, exist_ok=True) +cache = dc.Cache(cache_dir) -def _download_hml_devil(frequency: str = 'M') -> pd.DataFrame: - base_url = 'https://www.aqr.com/-/media/AQR/Documents/Insights/' - file = 'daily' if frequency.lower() == 'd' else 'monthly' - url = f'{base_url}/Data-Sets/The-Devil-in-HMLs-Details-Factors-{file}.xlsx' - - print('Downloading HML Devil factors from AQR... This can take a while. Please be patient or something.') # noqa: E501 +def _aqr_download_data(url: str) -> pd.DataFrame: + """Download the data from the given URL.""" + print('Downloading data... This can take a while. Please be patient.') response = requests.get(url, verify=True, timeout=180) xls = pd.ExcelFile(BytesIO(response.content)) + return xls +def _aqr_process_data(xls: pd.ExcelFile) -> pd.DataFrame: + """Process the downloaded data.""" sheets = {0: 'HML Devil', 4: 'MKT', 5: 'SMB', 7: 'UMD', 8: 'RF'} dfs = [] @@ -351,53 +353,21 @@ def _download_hml_devil(frequency: str = 'M') -> pd.DataFrame: for sheet_index, sheet_name in sheets.items(): df = df_dict[sheet_name] - df = df[['USA']] if sheet_index != 8 else df.iloc[:, 0:1] - df.columns = [sheet_name] dfs.append(df) - + # Drop NaNs but only RF UMD data = pd.concat(dfs, axis=1) - data.rename(columns={'MKT': 'Mkt-RF', - 'HML Devil': 'HML_DEVIL'}) - + data = data.dropna(subset=['RF', 'UMD']) + data.rename(columns={'MKT': 'Mkt-RF', 'HML Devil': 'HML_DEVIL'}, inplace=True) data = data.astype(float) - return data - - -# TODO: FIXME: HML Devil isn't using cache in cli. see /utils/cli.py probably. MKT -> Mkt-RF also!! Needs to be fixed before a swap-out hml for hm_devil func! # noqa: E501 - -def _get_hml_devil(frequency: str = 'M', - start_date: Optional[str] = None, - end_date: Optional[str] = None, - output: Optional[str] = None, - series: bool = False) -> Union[pd.Series, pd.DataFrame]: - - # Use the current date as a cache key - current_date = datetime.date.today() - cache_key = ('hmld', frequency, None, None, None, None, current_date) - - # Check if the data is in the cache - data = cache.get(cache_key) - if data is not None: - return data - - # If the data is not in the cache, download it - data = _download_hml_devil() - - # Apply transformations to the data - data = data.rename(columns={'MKT': 'Mkt-RF', 'HML Devil': 'HML_DEVIL'}) - data = data.astype(float) - - # Store the transformed data in the cache - cache[cache_key] = data + data = data.dropna() return data -def hml_devil_factors(frequency: str = 'M', - start_date: Optional[str] = None, +def hml_devil_factors(frequency: str = 'M', start_date: Optional[str] = None, end_date: Optional[str] = None, output: Optional[str] = None, series: bool = False) -> Union[pd.Series, pd.DataFrame]: @@ -421,14 +391,32 @@ def hml_devil_factors(frequency: str = 'M', pd.DataFrame: the HML Devil model data indexed by date. pd.Series: the HML factor as a pd.Series """ - data = _get_hml_devil(frequency, start_date, end_date, series=series) - data = data.dropna() + base_url = 'https://www.aqr.com/-/media/AQR/Documents/Insights/' + file = 'daily' if frequency.lower() == 'd' else 'monthly' + url = f'{base_url}/Data-Sets/The-Devil-in-HMLs-Details-Factors-{file}.xlsx' - data = data.rename(columns={'MKT': 'Mkt-RF'}) - data = data.dropna() + # Use the current date as a cache key + current_date = datetime.date.today().strftime('%Y-%m-%d') + cache_key = ('hmld', frequency, None, None, None, None, current_date) - return _process(data, start_date, end_date, filepath=output) + # Check if the data is in the cache + data = cache.get(cache_key, default=None) + if data is not None: + print("Using cached data") + return data + + # If the data is not in the cache, download it + print("Not using cache, downloading data") + xls = _aqr_download_data(url) + + # Process the downloaded data + data = _aqr_process_data(xls) + + # Store the processed data in the cache + cache.set(cache_key, data, expire=86400) # TTL is set here + + return data def barillas_shanken_factors(frequency: str = 'M', diff --git a/getfactormodels/utils/utils.py b/getfactormodels/utils/utils.py index b1bb27a..d52396b 100644 --- a/getfactormodels/utils/utils.py +++ b/getfactormodels/utils/utils.py @@ -20,7 +20,7 @@ "liquidity": r"^(il)?liq(uidity)?|(pastor|ps|sp)$", "icr": r"\bicr|hkm\b", "dhs": r"^(\bdhs\b|behav.*)$", - "hml_devil": r"\bhml(_)?d(evil)?\b", + "hml_devil": r"\bhml(_)?d(evil)?|hmld\b", "barillas_shanken": r"\b(bs|bs6|barillas|shanken)\b", }) From afd752f7ddcf7a6507928c196648bb902584731d Mon Sep 17 00:00:00 2001 From: x512 Date: Sun, 24 Dec 2023 05:12:42 +1100 Subject: [PATCH 11/17] fix: ``hml_devil_factors()`` using persistent cache (``diskcache``) --- getfactormodels/__main__.py | 9 +++++---- getfactormodels/models/models.py | 3 +-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/getfactormodels/__main__.py b/getfactormodels/__main__.py index bd0c58e..d964f9f 100755 --- a/getfactormodels/__main__.py +++ b/getfactormodels/__main__.py @@ -1,19 +1,20 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- from pathlib import Path +from typing import Optional import pandas as pd from dateutil import parser # ruff: noqa: RUF100 from getfactormodels.models.models import \ barillas_shanken_factors # noqa: F401 -from getfactormodels.models.models import (carhart_factors, dhs_factors, # noqa: F401, E501 - ff_factors, hml_devil_factors, - icr_factors, liquidity_factors, +from getfactormodels.models.models import (carhart_factors, # noqa: F401, E501 + dhs_factors, ff_factors, + hml_devil_factors, icr_factors, + liquidity_factors, mispricing_factors, q_classic_factors, q_factors) from getfactormodels.utils.cli import parse_args from getfactormodels.utils.utils import _get_model_key, _process -from typing import Optional def get_factors(model: str = "3", diff --git a/getfactormodels/models/models.py b/getfactormodels/models/models.py index 4eb50b7..d09159b 100644 --- a/getfactormodels/models/models.py +++ b/getfactormodels/models/models.py @@ -27,17 +27,16 @@ """ from __future__ import annotations import datetime +import os from io import BytesIO from typing import Optional, Union import diskcache as dc -import os import numpy as np import pandas as pd import requests from getfactormodels.utils.utils import _process, get_file_from_url from .ff_models import _get_ff_factors - # TODO: "PEP 484 prohibits implicit `Optional`" see: RUFF013. From b075b741e23cb5822c9ac243673809700fedc048 Mon Sep 17 00:00:00 2001 From: x512 Date: Sun, 24 Dec 2023 05:45:41 +1100 Subject: [PATCH 12/17] fix: cli `--model` with ``hml_devil`` now using cache --- getfactormodels/__main__.py | 13 +++---- getfactormodels/models/models.py | 62 +++++++++++++++++--------------- 2 files changed, 40 insertions(+), 35 deletions(-) diff --git a/getfactormodels/__main__.py b/getfactormodels/__main__.py index d964f9f..6660592 100755 --- a/getfactormodels/__main__.py +++ b/getfactormodels/__main__.py @@ -7,8 +7,8 @@ # ruff: noqa: RUF100 from getfactormodels.models.models import \ barillas_shanken_factors # noqa: F401 -from getfactormodels.models.models import (carhart_factors, # noqa: F401, E501 - dhs_factors, ff_factors, +from getfactormodels.models.models import carhart_factors # noqa: F401, E501 +from getfactormodels.models.models import (dhs_factors, ff_factors, hml_devil_factors, icr_factors, liquidity_factors, mispricing_factors, @@ -99,11 +99,11 @@ def __init__(self, self._no_mkt = False self.df = None - def no_rf(self): + def no_rf(self) -> None: """Sets the _no_rf flag to True.""" self._no_rf = True - def no_mkt(self): + def no_mkt(self) -> None: """Sets the _no_mkt flag to True.""" self._no_mkt = True @@ -118,7 +118,8 @@ def validate_date_format(date_string: str) -> str: try: return parser.parse(date_string).strftime("%Y-%m-%d") except ValueError as err: - raise ValueError("Incorrect date format, use YYYY-MM-DD.") from err + error_message = "Incorrect date format, use YYYY-MM-DD." + raise ValueError(error_message) from err def get_factors(self) -> pd.DataFrame: """Fetch the factor data and store it in the class.""" @@ -130,7 +131,7 @@ def get_factors(self) -> pd.DataFrame: output=self.output) if self._no_rf: - self.df = self.drop_rf(self.df.copy()) # create a copy before drop -- use cache. + self.df = self.drop_rf(self.df.copy()) # create a copy before drop if self._no_mkt: self.df = self.drop_mkt(self.df.copy()) diff --git a/getfactormodels/models/models.py b/getfactormodels/models/models.py index d09159b..8ec5479 100644 --- a/getfactormodels/models/models.py +++ b/getfactormodels/models/models.py @@ -27,8 +27,8 @@ """ from __future__ import annotations import datetime -import os from io import BytesIO +from pathlib import Path from typing import Optional, Union import diskcache as dc import numpy as np @@ -37,8 +37,6 @@ from getfactormodels.utils.utils import _process, get_file_from_url from .ff_models import _get_ff_factors -# TODO: "PEP 484 prohibits implicit `Optional`" see: RUFF013. - def ff_factors(model: str = "3", frequency: str = "M", @@ -86,8 +84,9 @@ def liquidity_factors(frequency: str = "M", url += '-/media/research/famamiller/data/liq_data_1962_2022.txt' if frequency.lower() != 'm': + err_msg = "Frequency must be 'm'." print('Liquidity factors are only available for monthly frequency.') - raise ValueError("Frequency must be 'm'.") + raise ValueError(err_msg) # Get .csv here... data = get_file_from_url(url) @@ -126,9 +125,9 @@ def mispricing_factors(frequency: str = "M", output: Optional[str] = None) -> pd.DataFrame: """Retrieve the Stambaugh-Yuan mispricing factors. Daily and monthly.""" if frequency.lower() not in ["d", "m"]: - print("Mispricing factors are only available for daily and monthly \ - frequency.") - raise ValueError("Frequency must be 'd' or 'm'.") + error_msg = "Mispricing factors are only available for daily and\ + monthly frequency." + raise ValueError(error_msg) return None file = "M4d" if frequency == "d" else "M4" @@ -216,16 +215,16 @@ def dhs_factors(frequency: str = "M", frequency = frequency.lower() base_url = "https://docs.google.com/spreadsheets/d/" - if frequency.lower() == "m": - file = "1RxYLbCfk19m8fnniiJYfaj3yI55ZPaoi/export?format=xlsx" - elif frequency.lower() == "d": - file = "1KnCP-NVhf2Sni8bVFIVyMxW-vIljBOWE/export?format=xlsx" + if frequency == "m": + sheet = "1RxYLbCfk19m8fnniiJYfaj3yI55ZPaoi/export?format=xlsx" + elif frequency == "d": + sheet = "1KnCP-NVhf2Sni8bVFIVyMxW-vIljBOWE/export?format=xlsx" else: - print("Frequency must be either 'M' (monthly) or 'D' (daily).") - raise ValueError("Frequency must be 'M' or 'D'.") - # TODO: use the link to the Google Sheet instead of the actual sheet. + error_message = "Frequency must be 'm' or 'd' for the DHHS factors'." + print(error_message) + raise ValueError(error_message) - url = base_url + file + url = base_url + sheet response = requests.get(url, verify=True, timeout=20) content = BytesIO(response.content) @@ -262,11 +261,12 @@ def icr_factors(frequency: str = "M", """Retrieve the He, Kelly, Manela (2017) ICR factors. * Daily since 1999-05-03; quarterly and monthly since 1970. """ - # TODO: Do we need Mkt-RF and RF [seen reffered to as 2-factor model]? + # TODO: Do we need Mkt-RF and RF [seen referred to as 2-factor model. Also liq doesnt have mkt-rf or rf]? # noqa frequency = frequency.lower() if frequency not in ["d", "m", "q"]: - raise ValueError("Frequency must be 'd', 'm' or 'q'.") + err_msg = "Frequency must be 'd', 'm' or 'q'." + raise ValueError(err_msg) base_url = "https://voices.uchicago.edu/zhiguohe" file = {"d": "daily", "m": "monthly", "q": "quarterly"}.get(frequency) @@ -327,10 +327,11 @@ def carhart_factors(frequency: str = "M", # =========================== EXPERIMENTAL ================================== # -cache_dir = os.path.expanduser('~/.cache/getfactormodels/aqr/hml_devil') -os.makedirs(cache_dir, exist_ok=True) +cache_dir = Path('~/.cache/getfactormodels/aqr/hml_devil').expanduser() +cache_dir.mkdir(parents=True, exist_ok=True) cache = dc.Cache(cache_dir) + def _aqr_download_data(url: str) -> pd.DataFrame: """Download the data from the given URL.""" print('Downloading data... This can take a while. Please be patient.') @@ -338,6 +339,7 @@ def _aqr_download_data(url: str) -> pd.DataFrame: xls = pd.ExcelFile(BytesIO(response.content)) return xls + def _aqr_process_data(xls: pd.ExcelFile) -> pd.DataFrame: """Process the downloaded data.""" sheets = {0: 'HML Devil', 4: 'MKT', 5: 'SMB', 7: 'UMD', 8: 'RF'} @@ -352,16 +354,17 @@ def _aqr_process_data(xls: pd.ExcelFile) -> pd.DataFrame: for sheet_index, sheet_name in sheets.items(): df = df_dict[sheet_name] - df = df[['USA']] if sheet_index != 8 else df.iloc[:, 0:1] + df = df[['USA']] if sheet_index != 8 else df.iloc[:, 0:1] # noqa df.columns = [sheet_name] dfs.append(df) - # Drop NaNs but only RF UMD + data = pd.concat(dfs, axis=1) + data = data.dropna(subset=['RF', 'UMD']) - data.rename(columns={'MKT': 'Mkt-RF', 'HML Devil': 'HML_DEVIL'}, inplace=True) - data = data.astype(float) - data = data.dropna() + data.rename(columns={'MKT': 'Mkt-RF', 'HML Devil': 'HML_DEVIL'}) + + data = data.astype(float) return data @@ -395,13 +398,14 @@ def hml_devil_factors(frequency: str = 'M', start_date: Optional[str] = None, file = 'daily' if frequency.lower() == 'd' else 'monthly' url = f'{base_url}/Data-Sets/The-Devil-in-HMLs-Details-Factors-{file}.xlsx' - # Use the current date as a cache key + # Use the current date and end date as a cache key current_date = datetime.date.today().strftime('%Y-%m-%d') - cache_key = ('hmld', frequency, None, None, None, None, current_date) + cache_key = ('hmld', frequency, None, None, None, None, current_date, + end_date) # Check if the data is in the cache - data = cache.get(cache_key, default=None) - if data is not None: + data, cached_end_date = cache.get(cache_key, default=(None, None)) + if data is not None and (end_date is None or end_date <= cached_end_date): print("Using cached data") return data @@ -413,7 +417,7 @@ def hml_devil_factors(frequency: str = 'M', start_date: Optional[str] = None, data = _aqr_process_data(xls) # Store the processed data in the cache - cache.set(cache_key, data, expire=86400) # TTL is set here + cache[cache_key] = (data, end_date) # TTL is set here return data From 08d8fd3267e190e1345bbf7ef54993e4e8d5fd04 Mon Sep 17 00:00:00 2001 From: x512 Date: Sun, 24 Dec 2023 05:54:14 +1100 Subject: [PATCH 13/17] fix: ``hml_devil`` monthly TypeError --- getfactormodels/models/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/getfactormodels/models/models.py b/getfactormodels/models/models.py index 8ec5479..c4e8b4b 100644 --- a/getfactormodels/models/models.py +++ b/getfactormodels/models/models.py @@ -362,7 +362,7 @@ def _aqr_process_data(xls: pd.ExcelFile) -> pd.DataFrame: data = data.dropna(subset=['RF', 'UMD']) - data.rename(columns={'MKT': 'Mkt-RF', 'HML Devil': 'HML_DEVIL'}) + data.rename(columns={'MKT': 'Mkt-RF', 'HML Devil': 'HML_Devil'}) data = data.astype(float) @@ -449,7 +449,7 @@ def barillas_shanken_factors(frequency: str = 'M', hml_devil = hml_devil_factors(frequency=frequency, start_date=start_date, series=True) - hml_devil = hml_devil.rename('HML_m') + hml_devil = hml_devil.rename(columns={'HML_Devil': 'HML_m'}) hml_devil.index.name = 'date' df = df.merge(hml_devil, left_index=True, From a57134c83fbdfea0cb2b2ab20958500b296f6f73 Mon Sep 17 00:00:00 2001 From: x512 Date: Sun, 24 Dec 2023 06:34:20 +1100 Subject: [PATCH 14/17] fix: typo (series not df for bs); confirm ``hml_devil_factors`` using cache --- getfactormodels/models/models.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/getfactormodels/models/models.py b/getfactormodels/models/models.py index c4e8b4b..ae4b689 100644 --- a/getfactormodels/models/models.py +++ b/getfactormodels/models/models.py @@ -447,12 +447,11 @@ def barillas_shanken_factors(frequency: str = 'M', df = q.merge(ff, left_index=True, right_index=True, how='inner') hml_devil = hml_devil_factors(frequency=frequency, start_date=start_date, - series=True) - - hml_devil = hml_devil.rename(columns={'HML_Devil': 'HML_m'}) + series=True)[['HML Devil']] + hml_devil.index.name = 'date' - df = df.merge(hml_devil, left_index=True, - right_index=True, how='inner') + hml_devil = hml_devil.rename(columns={'HML Devil': 'HML_m'}) + df = df.merge(hml_devil, left_index=True, right_index=True, how='inner') return _process(df, start_date, end_date, filepath=output) From a060c0a80249b926f7473547f384a9a5358bd4d3 Mon Sep 17 00:00:00 2001 From: x512 Date: Sun, 24 Dec 2023 06:47:35 +1100 Subject: [PATCH 15/17] prepare version 0.0.4 --- README.md | 2 +- getfactormodels/__init__.py | 2 +- getfactormodels/models/__init__.py | 4 ++-- pyproject.toml | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 8d76048..9379e9c 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ _Thanks to: Kenneth French, Robert Stambaugh, Lin Sun, Zhiguo He, AQR Capital Ma >[!IMPORTANT] >![PyPI - Status](https://img.shields.io/pypi/status/getfactormodels?style=flat-square) - +> >``getfactormodels`` is new. It was released on December 20, 2023. Don't rely on it for anything. After installation, import and call the ``get_factors()`` function with the ``model`` and ``frequency`` params: diff --git a/getfactormodels/__init__.py b/getfactormodels/__init__.py index f7d785e..a480868 100644 --- a/getfactormodels/__init__.py +++ b/getfactormodels/__init__.py @@ -20,7 +20,7 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -__version__ = "0.0.3" +__version__ = "0.0.4" from .__main__ import FactorExtractor, get_factors from .models import models # noqa: F401, RUF100 (silent flake8 in VScode) diff --git a/getfactormodels/models/__init__.py b/getfactormodels/models/__init__.py index 9be4f54..f314af1 100644 --- a/getfactormodels/models/__init__.py +++ b/getfactormodels/models/__init__.py @@ -20,5 +20,5 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from . import ff_models # noqa: F401 - TODO: disable 401 in all __init__ -from . import models # noqa: F401 +from . import ff_models +from . import models diff --git a/pyproject.toml b/pyproject.toml index 4c85fd5..5c6ed10 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "getfactormodels" dynamic = ["version"] description = "Retrieve data for various multifactor asset pricing models." -authors = [{name = "S. Martin", email = "x512@pm.me"}] +authors = [{name = "S. Martin", email = "x512@pm.me"}, ] license = {file = "LICENSE"} readme = "README.md" keywords = ['finance', 'pricing models', 'financial analysis', 'econometrics', From bb8cbcd27339691cb0b639e2f85c0ce05d25b18a Mon Sep 17 00:00:00 2001 From: x512 Date: Sun, 24 Dec 2023 07:05:15 +1100 Subject: [PATCH 16/17] prepared: v 0.0.4 --- getfactormodels/models/__init__.py | 3 +-- getfactormodels/models/models.py | 7 ++++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/getfactormodels/models/__init__.py b/getfactormodels/models/__init__.py index f314af1..e12f70d 100644 --- a/getfactormodels/models/__init__.py +++ b/getfactormodels/models/__init__.py @@ -20,5 +20,4 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from . import ff_models -from . import models +from . import ff_models, models diff --git a/getfactormodels/models/models.py b/getfactormodels/models/models.py index ae4b689..a093b83 100644 --- a/getfactormodels/models/models.py +++ b/getfactormodels/models/models.py @@ -362,7 +362,8 @@ def _aqr_process_data(xls: pd.ExcelFile) -> pd.DataFrame: data = data.dropna(subset=['RF', 'UMD']) - data.rename(columns={'MKT': 'Mkt-RF', 'HML Devil': 'HML_Devil'}) + data.rename(columns={'MKT': 'Mkt-RF', 'HML Devil': 'HML_Devil'}, + inplace=True) data = data.astype(float) @@ -447,11 +448,11 @@ def barillas_shanken_factors(frequency: str = 'M', df = q.merge(ff, left_index=True, right_index=True, how='inner') hml_devil = hml_devil_factors(frequency=frequency, start_date=start_date, - series=True)[['HML Devil']] + series=True)[['HML _evil']] hml_devil.index.name = 'date' - hml_devil = hml_devil.rename(columns={'HML Devil': 'HML_m'}) + hml_devil = hml_devil.rename(columns={'HML_Devil': 'HML_m'}) df = df.merge(hml_devil, left_index=True, right_index=True, how='inner') return _process(df, start_date, end_date, filepath=output) From 4202c449bfbfad0c8985fa8ba2ca850d2b4eb36b Mon Sep 17 00:00:00 2001 From: x512 Date: Sun, 24 Dec 2023 07:18:50 +1100 Subject: [PATCH 17/17] prepared v 0.0.4 now --- getfactormodels/models/models.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/getfactormodels/models/models.py b/getfactormodels/models/models.py index a093b83..d3c364d 100644 --- a/getfactormodels/models/models.py +++ b/getfactormodels/models/models.py @@ -220,7 +220,7 @@ def dhs_factors(frequency: str = "M", elif frequency == "d": sheet = "1KnCP-NVhf2Sni8bVFIVyMxW-vIljBOWE/export?format=xlsx" else: - error_message = "Frequency must be 'm' or 'd' for the DHHS factors'." + error_message = "Frequency must be 'm' or 'd' for the DHS factors'." print(error_message) raise ValueError(error_message) @@ -362,9 +362,6 @@ def _aqr_process_data(xls: pd.ExcelFile) -> pd.DataFrame: data = data.dropna(subset=['RF', 'UMD']) - data.rename(columns={'MKT': 'Mkt-RF', 'HML Devil': 'HML_Devil'}, - inplace=True) - data = data.astype(float) return data @@ -394,12 +391,10 @@ def hml_devil_factors(frequency: str = 'M', start_date: Optional[str] = None, pd.DataFrame: the HML Devil model data indexed by date. pd.Series: the HML factor as a pd.Series """ - base_url = 'https://www.aqr.com/-/media/AQR/Documents/Insights/' file = 'daily' if frequency.lower() == 'd' else 'monthly' url = f'{base_url}/Data-Sets/The-Devil-in-HMLs-Details-Factors-{file}.xlsx' - # Use the current date and end date as a cache key current_date = datetime.date.today().strftime('%Y-%m-%d') cache_key = ('hmld', frequency, None, None, None, None, current_date, end_date) @@ -407,15 +402,15 @@ def hml_devil_factors(frequency: str = 'M', start_date: Optional[str] = None, # Check if the data is in the cache data, cached_end_date = cache.get(cache_key, default=(None, None)) if data is not None and (end_date is None or end_date <= cached_end_date): - print("Using cached data") + # Use it if it is and the end date is the same or earlier return data - # If the data is not in the cache, download it - print("Not using cache, downloading data") xls = _aqr_download_data(url) # Process the downloaded data data = _aqr_process_data(xls) + data.rename(columns={'MKT': 'Mkt-RF', 'HML Devil': 'HML_Devil'}, + inplace=True) # Store the processed data in the cache cache[cache_key] = (data, end_date) # TTL is set here @@ -448,11 +443,11 @@ def barillas_shanken_factors(frequency: str = 'M', df = q.merge(ff, left_index=True, right_index=True, how='inner') hml_devil = hml_devil_factors(frequency=frequency, start_date=start_date, - series=True)[['HML _evil']] + series=True)[['HML Devil']] hml_devil.index.name = 'date' - hml_devil = hml_devil.rename(columns={'HML_Devil': 'HML_m'}) + hml_devil = hml_devil.rename(columns={'HML Devil': 'HML_m'}) df = df.merge(hml_devil, left_index=True, right_index=True, how='inner') return _process(df, start_date, end_date, filepath=output)