Skip to content

Commit

Permalink
Merge pull request #316 from capitalone/develop
Browse files Browse the repository at this point in the history
Release v0.6.3
  • Loading branch information
fdosani authored Dec 17, 2024
2 parents 29715b9 + 2bd24ce commit ddb33ef
Show file tree
Hide file tree
Showing 6 changed files with 101 additions and 39 deletions.
6 changes: 5 additions & 1 deletion .github/workflows/edgetest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,14 @@ name: Run edgetest
on:
schedule:
- cron: '35 17 * * 5'
workflow_dispatch:
jobs:
edgetest:
runs-on: ubuntu-latest
name: running edgetest
permissions:
contents: write
pull-requests: write
steps:
- uses: actions/checkout@v4
with:
Expand All @@ -19,7 +23,7 @@ jobs:
cp tests/data/.locopyrc ~/.locopyrc
cp tests/data/.locopy-sfrc ~/.locopy-sfrc
- id: run-edgetest
uses: fdosani/run-edgetest-action@v1.3
uses: edgetest-dev/run-edgetest-action@v1.5
with:
edgetest-flags: '-c pyproject.toml --export'
base-branch: 'develop'
Expand Down
2 changes: 1 addition & 1 deletion locopy/_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.6.2"
__version__ = "0.6.3"
53 changes: 36 additions & 17 deletions locopy/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

import pandas as pd
import polars as pl
import pyarrow as pa
import yaml

from locopy.errors import (
Expand Down Expand Up @@ -317,6 +318,20 @@ def validate_float_object(column):
except (ValueError, TypeError):
return None

def check_column_type_pyarrow(pa_dtype):
    """Translate a pyarrow data type into a warehouse column type name.

    Parameters
    ----------
    pa_dtype : pyarrow.DataType
        The pyarrow type backing a pandas ``ArrowDtype`` column.

    Returns
    -------
    str
        One of ``"timestamp"``, ``"boolean"``, ``"int"``, ``"float"`` or
        ``"varchar"``. Strings and any type not matched by the checks
        below are reported as ``"varchar"``.

    Notes
    -----
    NOTE(review): ``pa.types.is_temporal`` presumably also matches date,
    time and duration types, so those would be reported as
    ``"timestamp"`` as well -- confirm this is intended.
    """
    # Ordered (predicate, type name) pairs; the first match wins.
    type_checks = (
        (pa.types.is_temporal, "timestamp"),
        (pa.types.is_boolean, "boolean"),
        (pa.types.is_integer, "int"),
        (pa.types.is_floating, "float"),
    )
    for matches, type_name in type_checks:
        if matches(pa_dtype):
            return type_name
    # String types and everything else fall back to varchar.
    return "varchar"

if warehouse_type.lower() not in ["snowflake", "redshift"]:
raise ValueError(
'warehouse_type argument must be either "snowflake" or "redshift"'
Expand All @@ -328,24 +343,28 @@ def validate_float_object(column):
data = dataframe[column].dropna().reset_index(drop=True)
if data.size == 0:
column_type.append("varchar")
elif (data.dtype in ["datetime64[ns]", "M8[ns]"]) or (
re.match(r"(datetime64\[ns\,\W)([a-zA-Z]+)(\])", str(data.dtype))
):
column_type.append("timestamp")
elif str(data.dtype).lower().startswith("bool"):
column_type.append("boolean")
elif str(data.dtype).startswith("object"):
data_type = validate_float_object(data) or validate_date_object(data)
if not data_type:
column_type.append("varchar")
else:
column_type.append(data_type)
elif str(data.dtype).lower().startswith("int"):
column_type.append("int")
elif str(data.dtype).lower().startswith("float"):
column_type.append("float")
elif isinstance(data.dtype, pd.ArrowDtype):
datatype = check_column_type_pyarrow(data.dtype.pyarrow_dtype)
column_type.append(datatype)
else:
column_type.append("varchar")
if (data.dtype in ["datetime64[ns]", "M8[ns]"]) or (
re.match(r"(datetime64\[ns\,\W)([a-zA-Z]+)(\])", str(data.dtype))
):
column_type.append("timestamp")
elif str(data.dtype).lower().startswith("bool"):
column_type.append("boolean")
elif str(data.dtype).startswith("object"):
data_type = validate_float_object(data) or validate_date_object(data)
if not data_type:
column_type.append("varchar")
else:
column_type.append(data_type)
elif str(data.dtype).lower().startswith("int"):
column_type.append("int")
elif str(data.dtype).lower().startswith("float"):
column_type.append("float")
else:
column_type.append("varchar")
logger.info("Parsing column %s to %s", column, column_type[-1])
return OrderedDict(zip(list(dataframe.columns), column_type))

Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ authors = [
{ name="Faisal Dosani", email="faisal.dosani@capitalone.com" },
]
license = {text = "Apache Software License"}
dependencies = ["boto3<=1.35.43,>=1.9.92", "PyYAML<=6.0.1,>=5.1", "pandas<=2.2.3,>=0.25.2", "numpy<=2.0.2,>=1.22.0", "polars>=0.20.0"]
dependencies = ["boto3<=1.35.80,>=1.9.92", "PyYAML<=6.0.2,>=5.1", "pandas<=2.2.3,>=1.5.0", "numpy<=2.2.0,>=1.22.0", "polars>=0.20.0", "pyarrow>=10.0.1"]

requires-python = ">=3.9.0"
classifiers = [
Expand Down Expand Up @@ -104,7 +104,7 @@ ban-relative-imports = "all"
convention = "numpy"

[edgetest.envs.core]
python_version = "3.9"
python_version = "3.10"
extras = [
"tests",
"psycopg2",
Expand Down
33 changes: 15 additions & 18 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,41 +1,38 @@
#
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile --output-file=requirements.txt pyproject.toml
#

boto3==1.34.126
# This file was autogenerated by uv via the following command:
# uv pip compile pyproject.toml -o requirements.txt
boto3==1.35.80
# via locopy (pyproject.toml)
botocore==1.34.130
botocore==1.35.80
# via
# boto3
# s3transfer
jmespath==1.0.1
# via
# boto3
# botocore
numpy==1.26.4
numpy==2.2.0
# via
# locopy (pyproject.toml)
# pandas
pandas==2.2.2
pandas==2.2.3
# via locopy (pyproject.toml)
polars==1.6.0
polars==1.17.1
# via locopy (pyproject.toml)
pyarrow==18.1.0
# via locopy (pyproject.toml)
python-dateutil==2.9.0.post0
# via
# botocore
# pandas
pytz==2024.1
pytz==2024.2
# via pandas
pyyaml==6.0.1
pyyaml==6.0.2
# via locopy (pyproject.toml)
s3transfer==0.10.1
s3transfer==0.10.4
# via boto3
six==1.16.0
six==1.17.0
# via python-dateutil
tzdata==2024.1
tzdata==2024.2
# via pandas
urllib3==1.26.20
urllib3==2.2.3
# via botocore
42 changes: 42 additions & 0 deletions tests/test_utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from unittest import mock

import locopy.utility as util
import pyarrow as pa
import pytest
from locopy.errors import (
CompressionError,
Expand Down Expand Up @@ -388,7 +389,48 @@ def test_find_column_type_new():
"d": "varchar",
"e": "boolean",
}
assert find_column_type(input_text, "snowflake") == output_text_snowflake
assert find_column_type(input_text, "redshift") == output_text_redshift


def test_find_column_type_pyarrow():
    """Check column type detection for pyarrow-backed pandas columns."""
    import pandas as pd

    frame = pd.DataFrame.from_dict(
        {
            "a": [1],
            "b": [pd.Timestamp("2017-01-01T12+0")],
            "c": [1.2],
            "d": ["a"],
            "e": [True],
        }
    )

    # Re-cast every column to its pyarrow-backed equivalent.
    arrow_dtypes = {
        "a": "int64[pyarrow]",
        "b": pd.ArrowDtype(pa.timestamp("ns", tz="UTC")),
        "c": "float64[pyarrow]",
        "d": pd.ArrowDtype(pa.string()),
        "e": "bool[pyarrow]",
    }
    arrow_frame = frame.astype(dtype=arrow_dtypes)

    # Both warehouse types are expected to yield the same mapping.
    expected = {
        "a": "int",
        "b": "timestamp",
        "c": "float",
        "d": "varchar",
        "e": "boolean",
    }
    for warehouse in ("snowflake", "redshift"):
        assert find_column_type(arrow_frame, warehouse) == expected

Expand Down

0 comments on commit ddb33ef

Please sign in to comment.