-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathAAC_clean_script.py
139 lines (77 loc) · 3.28 KB
/
AAC_clean_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# -------------------------------------------------------
# Import dependencies
import pandas as pd
import numpy as np
from datetime import datetime
from path import Path
import hvplot.pandas
import plotly.express as px
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
# Read in Merged DataFrame
filepath = 'Resources/intakes_outcomes.csv'
df = pd.read_csv(filepath)
# Make animal id an index
df = df.set_index("animal_id")
# Convert date_of_birth, intake_date, and outcome_date to datetime
dates_to_convert = ["date_of_birth", "intake_date", "outcome_date"]
def convert_functions(df_datetime):
for i in dates_to_convert:
df_datetime[i] = pd.to_datetime(df_datetime[i])
return df_datetime
print(convert_functions(df))
# Create columns "length_of_stay" to store duration animal stays in the shelter and intake and outcome ages in days
def get_difference(df):
df['length_of_stay'] = (df['outcome_date'] - df['intake_date']).dt.days
df['age_upon_intake_in_days'] = (
df['intake_date'] - df['date_of_birth']).dt.days
df['age_upon_outcome_in_days'] = (
df['outcome_date'] - df['date_of_birth']).dt.days
return df
print(get_difference(df))
# Drop all animals with intake age of "0 years" who are not nursing or neonatal
df = df.drop(df[(df['date_of_birth'] >= df['intake_date'])
& (df['intake_condition'] != 'Nursing')
& (df['intake_condition'] != 'Neonatal')].index)
# # Drop columns
# Drop unnecessary columns
def drop_columns(df, *cols):
if not cols:
return "Column doesn't exist"
return df.drop(columns=list(cols), inplace=True)
drop_columns(df, 'intake_date', 'date_of_birth', 'outcome_date',
'age_upon_outcome', 'age_upon_intake', 'outcome_subtype')
print(df)
# Drop Any Rows with NaN
# Count number of null values in each column
df.isna().sum()
# Drop NaN rows
df.dropna(inplace=True)
# ### Dashboard Data
# Export cleaned data for use on Dashboard
df.to_csv('Resources/dashboard_data.csv', index=False)
# Prepare the dataframe for the Machine Learning by encoding the fields
# categorical_vars = [col for col in df.columns if df[col].dtype=="O"]
categorical_vars = ['animal_type', 'breed', 'color', 'intake_type',
'intake_condition', 'sex_upon_intake', 'sex_upon_outcome']
categorical_vars
# Use one hot encoder method
enc = OneHotEncoder(sparse=False)
# Fit and transform the OneHotEncoder using the categorical variable list
encoded_df = pd.DataFrame(enc.fit_transform(
df[categorical_vars]), index=df.index)
# Add the encoded variable names to the dataframe
encoded_df.columns = enc.get_feature_names(categorical_vars)
# ### Drop Extra Columns Pt 2: Drop columns that were encoded
# Drop columns that were encoded
df.drop(columns=categorical_vars, inplace=True)
# ### Merge Encoded Dataframe with Columns from Original Dataframe
# Join encoded_df with df
ml_df = df.join(encoded_df)
# ### Drop Any Remaining Rows with NaN
# Drop NaN rows
ml_df.dropna(inplace=True)
# ### Export Cleaned Dataframe to CSV for Machine Learning
# Export the clean, encoded data to a new CSV file for running ML models
ml_df.to_csv('Resources/ml.csv', index=False)
# ### **Start removing columns from ml_df that don't influence the ML model**