-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataframe_operations.py
112 lines (85 loc) · 3.14 KB
/
dataframe_operations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
def explore_structure(df, return_dict=False):
    """
    Explore and summarize the structure of a DataFrame.

    Prints the shape and the number of columns per data type, then reports on
    each data type in turn. Float columns get a table of summary statistics;
    every other column gets a cardinality report and, when its value
    frequencies vary, a KDE plot of those frequencies.

    Args:
        df (pandas.DataFrame): The DataFrame to explore.
        return_dict (bool): If True, return a dictionary of DataFrames
            grouped by data type.

    Returns:
        dict: A dictionary mapping each data type name (str) to the
            sub-DataFrame of columns with that type if return_dict is True,
            else None.
    """
    hr = '-' * 42
    df_dict = {}
    n_rows = df.shape[0]

    # Print the shape of the DataFrame
    print('Shape:', df.shape)

    # Get value counts of data types
    dtypes_vc = df.dtypes.value_counts()
    print('Data types')
    print(dtypes_vc.to_frame())
    print(hr)

    # Iterate through each data type (as its string name)
    for dt in dtypes_vc.index.astype('str'):
        cols = df.select_dtypes(include=[dt]).columns

        # If return_dict is True, store columns of this data type
        if return_dict:
            df_dict[dt] = df[cols]

        print(f'{dt}-type columns')
        if 'float' in dt:
            # For float data types, print summary statistics
            df_stats = summary_statistics(df[cols])
            try_display(df_stats.round(4))
        else:
            # For non-float data types, report unique value counts and plot
            for c in cols:
                # 0-based position of the column; reported 1-based below
                i = df.columns.get_loc(c)
                # Number of distinct values, counting NaN as a value
                len_uv = df[c].nunique(dropna=False)
                # Frequency of each non-null value
                vc = df[c].value_counts()

                print()
                print(f"'{c}' (column {i + 1}) has {len_uv} unique values.")

                if vc.empty:
                    # Nothing to count or plot (all values null, or no rows)
                    print(f"'{c}' has no non-null values.")
                elif len_uv == n_rows:
                    # Every row holds a different value
                    print(f"'{c}' might be an ID column.")
                elif vc.nunique() == 1:
                    # All values share one frequency. Comparing the number of
                    # distinct frequencies (rather than std() == 0) also
                    # covers a single distinct value, whose std is NaN.
                    print(f"Each unique value in '{c}' appears {vc.mean():.0f} times.")
                else:
                    # Frequencies vary: plot their distribution
                    kdeplot(vc, c)
        print(hr)

    # Return the dictionary if return_dict is True
    if return_dict:
        return df_dict
def try_display(df):
    """
    Show an object with `display` when that name exists, else with `print`.

    `display` is not defined or imported in this module; presumably it is
    supplied by an interactive environment such as IPython/Jupyter (confirm
    against how this module is used).

    Only the missing-name case triggers the fallback, so interrupts and
    errors raised while rendering are not silently swallowed.

    Args:
        df: The object to show (typically a pandas.DataFrame).
    """
    try:
        render = display  # noqa: F821 - resolved at runtime if available
    except NameError:
        render = print
    render(df)
def summary_statistics(df, stats=None):
    """
    Calculate summary statistics for a DataFrame.

    Args:
        df (pandas.DataFrame): The DataFrame to summarize.
        stats (list): List of statistics to calculate, passed to
            `DataFrame.agg`. Defaults to
            ['min', 'max', 'mean', 'median', 'std'].

    Returns:
        pandas.DataFrame: A DataFrame containing the calculated statistics.
    """
    # Build the default per call instead of sharing one mutable list object
    # across every invocation.
    if stats is None:
        stats = ['min', 'max', 'mean', 'median', 'std']
    return df.agg(stats)
def kdeplot(series, title, ax=None):
    """
    Create a kernel density estimation plot for a given series.

    Args:
        series (pandas.Series): The series to plot.
        title (str): The title for the plot.
        ax (matplotlib.axes.Axes, optional): Axes to draw on. When omitted,
            a new figure is created so that repeated calls do not draw on
            top of one another.

    Returns:
        matplotlib.axes.Axes: The axes containing the plot.
    """
    # Apply the theme first so a newly created axes picks it up.
    sns.set_theme(style='darkgrid')
    if ax is None:
        # Without an explicit axes seaborn draws on the current one, which
        # would stack every call onto the same plot.
        _, ax = plt.subplots()
    sns.kdeplot(series, color='green', ax=ax)
    ax.set_title(title)
    return ax