Skip to content

Commit

Permalink
Merge pull request #278 from capitalone/develop
Browse files Browse the repository at this point in the history
v0.5.9
  • Loading branch information
gladysteh99 authored Jun 20, 2024
2 parents 90c8c8a + e280467 commit d6d098a
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 20 deletions.
2 changes: 1 addition & 1 deletion locopy/_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.5.8"
__version__ = "0.5.9"
18 changes: 9 additions & 9 deletions locopy/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,14 +251,14 @@ def find_column_type(dataframe, warehouse_type: str):
Following is the list of pandas data types that the function checks and their mapping in sql:
- bool -> boolean
- datetime64[ns] -> timestamp
- bool/pd.BooleanDtype -> boolean
- datetime64[ns, <tz>] -> timestamp
- M8[ns] -> timestamp
- int -> int
- float -> float
- int/pd.Int64Dtype -> int
- float/pd.Float64Dtype -> float
- float object -> float
- datetime object -> timestamp
- object -> varchar
- object/pd.StringDtype -> varchar
For all other data types, the column will be mapped to varchar type.
Expand Down Expand Up @@ -313,19 +313,19 @@ def validate_float_object(column):
data = dataframe[column].dropna().reset_index(drop=True)
if data.size == 0:
column_type.append("varchar")
elif data.dtype in ["datetime64[ns]", "M8[ns]"]:
elif (data.dtype in ["datetime64[ns]", "M8[ns]"]) or (re.match("(datetime64\[ns\,\W)([a-zA-Z]+)(\])",str(data.dtype))):
column_type.append("timestamp")
elif data.dtype == "bool":
elif str(data.dtype).lower().startswith("bool"):
column_type.append("boolean")
elif str(data.dtype).startswith("object"):
data_type = validate_float_object(data) or validate_date_object(data)
if not data_type:
column_type.append("varchar")
else:
column_type.append(data_type)
elif str(data.dtype).startswith("int"):
elif str(data.dtype).lower().startswith("int"):
column_type.append("int")
elif str(data.dtype).startswith("float"):
elif str(data.dtype).lower().startswith("float"):
column_type.append("float")
else:
column_type.append("varchar")
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ authors = [
{ name="Faisal Dosani", email="faisal.dosani@capitalone.com" },
]
license = {text = "Apache Software License"}
dependencies = ["boto3<=1.34.83,>=1.9.92", "PyYAML<=6.0.1,>=5.1", "pandas<=2.2.2,>=0.25.2", "numpy<=1.26.4,>=1.22.0"]
dependencies = ["boto3<=1.34.126,>=1.9.92", "PyYAML<=6.0.1,>=5.1", "pandas<=2.2.2,>=0.25.2", "numpy<=1.26.4,>=1.22.0"]

requires-python = ">=3.8.0"
classifiers = [
Expand Down
19 changes: 10 additions & 9 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,35 +4,36 @@
#
# pip-compile --output-file=requirements.txt pyproject.toml
#
boto3==1.28.63

boto3==1.34.126
# via locopy (pyproject.toml)
botocore==1.31.67
botocore==1.34.130
# via
# boto3
# s3transfer
jmespath==1.0.1
# via
# boto3
# botocore
numpy==1.26.0
numpy==1.26.4
# via
# locopy (pyproject.toml)
# pandas
pandas==2.1.1
pandas==2.2.2
# via locopy (pyproject.toml)
python-dateutil==2.8.2
python-dateutil==2.9.0.post0
# via
# botocore
# pandas
pytz==2023.3.post1
pytz==2024.1
# via pandas
pyyaml==6.0.1
# via locopy (pyproject.toml)
s3transfer==0.7.0
s3transfer==0.10.1
# via boto3
six==1.16.0
# via python-dateutil
tzdata==2023.3
tzdata==2024.1
# via pandas
urllib3==2.0.7
urllib3==2.2.2
# via botocore
47 changes: 47 additions & 0 deletions tests/test_utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from itertools import cycle
from pathlib import Path
from unittest import mock
import datetime

import pytest

Expand Down Expand Up @@ -340,6 +341,52 @@ def test_find_column_type():
assert find_column_type(input_text, "snowflake") == output_text_snowflake
assert find_column_type(input_text, "redshift") == output_text_redshift

def test_find_column_type_new():
    """Verify ``find_column_type`` on pandas extension and tz-aware dtypes.

    A one-row frame is built and each column is cast to a pandas extension
    dtype (``Int64Dtype``, ``DatetimeTZDtype`` in UTC, ``Float64Dtype``,
    ``StringDtype``, ``BooleanDtype``). The resulting column-to-SQL-type
    mapping is expected to be the same for both warehouse types exercised
    here ("snowflake" and "redshift").
    """
    # Imported locally so the test does not rely on a module-level pandas import.
    import pandas as pd

    frame = pd.DataFrame.from_dict(
        {
            "a": [1],
            "b": [pd.Timestamp("2017-01-01T12+0")],
            "c": [1.2],
            "d": ["a"],
            "e": [True],
        }
    ).astype(
        dtype={
            "a": pd.Int64Dtype(),
            "b": pd.DatetimeTZDtype(tz=datetime.timezone.utc),
            "c": pd.Float64Dtype(),
            "d": pd.StringDtype(),
            "e": pd.BooleanDtype(),
        }
    )

    # Both warehouses are expected to yield the same mapping, so a single
    # expected dict is shared instead of keeping two identical copies.
    expected_types = {
        "a": "int",
        "b": "timestamp",
        "c": "float",
        "d": "varchar",
        "e": "boolean",
    }

    for warehouse_type in ("snowflake", "redshift"):
        assert find_column_type(frame, warehouse_type) == expected_types



def test_get_ignoreheader_number():
assert (
Expand Down

0 comments on commit d6d098a

Please sign in to comment.