-
Notifications
You must be signed in to change notification settings - Fork 0
/
Notebook_1_b_onlyTempCleaning.py
386 lines (308 loc) · 16.8 KB
/
Notebook_1_b_onlyTempCleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
# %% [markdown]
# # MASTER - Notebook 1 - Only temporal Cleaning
# ### Matteo Grazioso 884055
# %%
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import warnings
warnings.filterwarnings('ignore')
import myfunctions as mf # Custom functions
# %%
# Disply all columns and all rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# %%
# Find all txt files in the data folder
txt_files = mf.find_txt_files("data/raw")
# Choose a dataset from the list of txt files
selected_dataset = mf.choose_dataset(txt_files)
if selected_dataset:
print(f"You selected the dataset {selected_dataset}")
else:
print("No dataset selected.")
path = selected_dataset
# %%
# The file contain the data of the validation of tickets in the city of public transport of Venice.
# Import the data into a dataframe of a txt file
# path = 'data/raw/1-validazioni.txt' # Period: 2022-05-13 to 2022-07-15
# path = 'data/raw/2-esportazioneCompleta.txt' # Period: 2023-01-23 to 2023-03-14
# path = 'data/raw/3-esportazionePasqua23.txt' # Period: 2023-04-04 to 2023-06-03
df = pd.read_csv(path, header=0, sep='\t')
# Save the name of the file in a variable for future use extracting the name of the file from the path
file_name = path.split('/')[-1].split('.')[0]
# Remove the number and the - character from the head of the file name
file_name = file_name [file_name.find('-')+1:]
# %%
file_name
# %%
# Check the first 5 rows of the data
df.head()
# %%
# Check the last 5 rows of the data
df.tail()
# %%
# Create a subset of the data with the first 10% of the rows and the last 10% of the rows
# df = df.iloc[:int(len(df)*0.01),:]
# df = df.append(df.iloc[-int(len(df)*0.1):,:])
# %% [markdown]
# ## Explorative Data Analysis
#
# %%
# Dates and hour of the validation of the ticket are in the same column 'DATA_VALIDAZIONE'
# Split the column 'DATA_VALIDAZIONE' into two columns 'DATA' and 'ORA' and convert them to datetime format
df.insert(0, 'DATA', pd.to_datetime(df['DATA_VALIDAZIONE'].str.split(' ').str[0], format='%d/%m/%Y'))
df.insert(1, 'ORA', pd.to_datetime(df['DATA_VALIDAZIONE'].str.split(' ').str[1], format='%H:%M').dt.time)
# Drop the column 'DATA_VALIDAZIONE'
# df.drop('DATA_VALIDAZIONE', axis=1, inplace=True)
# Display the first 5 rows of the dataframe
df.head()
# %%
# Set the format of the timestamp
df['DATA_VALIDAZIONE'] = pd.to_datetime(df['DATA_VALIDAZIONE'], format='%d/%m/%Y %H:%M')
# %%
# Print the date of the first and last validation using both data and hour
print('First validation: ', df['DATA'].min(), df['ORA'].min())
print('Last validation: ', df['DATA'].max(), df['ORA'].max())
# Print the number of Serial numbers
print('Number of Serial numbers: ', df['SERIALE'].nunique())
# Print the number of validation (rows)
print('Number of validation: ', df.shape[0])
# Print the number of tickets
print('Number of tickets: ', df['DESCRIZIONE_TITOLO'].nunique())
# Print the number of titolo
print('Number of titolo: ', df['TITOLO'].nunique())
# TODO: why the number of unique TITOLO is different from the number of DESCRIZIONE_TITOLO?
# Print the number of FERMATA
print('Number of FERMATA: ', df['FERMATA'].nunique())
# Print the number of DESCRIZIONE
print('Number of DESCRIZIONE: ', df['DESCRIZIONE'].nunique())
# TODO: why the number of unique DESCRIZIONE is different from the number of FERMATA?
# %%
# Which is the most used ticket?
df['DESCRIZIONE_TITOLO'].value_counts().head(10)
# %%
# Which is the most frequent validation in date and hour?
# Date and hour are in two different columns; DATA_VALIDAZIONE does not exist anymore
df.groupby(['DATA', 'ORA'])['SERIALE'].count().sort_values(ascending=False).head(10)
# TODO: #4 Re-aswer the question of the most frequent validation after cleaning operations
# %%
# Which is the most frequent FERMATA?
df['DESCRIZIONE'].value_counts().head(10)
# TODO: #4 Re-aswer the question of the most frequent FERMATA after cleaning operations
# %% [markdown]
# ## Categories
# %% [markdown]
# The column TICKET_CODE will be filled with the code of the ticket profile according to the ticket type and the ticket validity as follows:
#
# **1.** One-day ticket
#
# **2.** Two-day ticket
#
# **3.** Three-day ticket
#
# **4.** Weekly ticket (Seven-day ticket)
#
# **5.** Monthly ticket
#
# **5-STUD.** Monthly ticket for students
#
# **5-RET.** Monthly ticket for retirees
#
# **5-WKRS.** Monthly ticket for workers
#
# **6.** Annual ticket
#
# **6-STUD.** Annual ticket for students
#
# **6-RET.** Annual ticket for retirees
#
# **6-WKRS.** Annual ticket for workers
#
# **7.** 75 minutes ticket
#
# **8.** Other ticket (if it is necessary to add other types of tickets)
# %%
def assignTicketCode(df_a: pd.DataFrame):
"""
This function assigns a ticket code to each row of the dataframe.
:param df: the dataframe
:return: the dataframe with the new column TICKET_CODE
"""
# Add a new column with the code profile of the ticket
df_a.insert(7, "TICKET_CODE", 'TBD')
# Define the dictionary of ticket codes
dict_tickets = {'1': 'One-day ticket', '2': 'Two-day ticket', '3': 'Three-day ticket',
'4': 'Seven-day ticket',
'5': 'Monthly ticket', '5-STUD': 'Monthly ticket for students',
'5-RET': 'Monthly ticket for retired', '5-WKRS': 'Monthly ticket for workers',
'6': 'Annual ticket', '6-STUD': 'Annual ticket for students', '6-RET': 'Annual ticket for retired',
'6-WKRS': 'Annual ticket for workers',
'7': '75 minutes ticket', '8': 'Other ticket'}
# Convert the column 'DESCRIZIONE_TITOLO' into upper case
df_a['DESCRIZIONE_TITOLO'] = df_a['DESCRIZIONE_TITOLO'].str.upper()
# One-day ticket
df_a.loc[df_a['DESCRIZIONE_TITOLO'].str.contains('GIORNALIERO|24H|24ORE|24 ORE|DAILY'), 'TICKET_CODE'] = '1'
# Two-day ticket
df_a.loc[df_a['DESCRIZIONE_TITOLO'].str.contains('48H|48ORE|48 ORE'), 'TICKET_CODE'] = '2'
# Three-day ticket
df_a.loc[df_a['DESCRIZIONE_TITOLO'].str.contains('72H|72ORE|72 ORE'), 'TICKET_CODE'] = '3'
# Seven-day ticket
df_a[df_a['DESCRIZIONE_TITOLO'].str.contains('7GG|7DAYS|7 DAYS')]['DESCRIZIONE_TITOLO'].value_counts()
# Monthly ticket
df_a.loc[df_a['DESCRIZIONE_TITOLO'].str.contains('MENSILE|30GG|30 GG|MENS'), 'TICKET_CODE'] = '5'
## Monthly ticket for students
df_a.loc[(df_a['TICKET_CODE'] == '5') & (df_a['DESCRIZIONE_TITOLO'].str.contains('STUDENTE|STUD')), 'TICKET_CODE'] = '5-STUD'
## Monthly ticket for retired
df_a.loc[(df_a['TICKET_CODE'] == '5') & (df_a['DESCRIZIONE_TITOLO'].str.contains('OVER 65|65+|PENSIONATI')), 'TICKET_CODE'] = '5-RET'
## Monthly ticket for workers
df_a.loc[(df_a['TICKET_CODE'] == '5') & (df_a['DESCRIZIONE_TITOLO'].str.contains('LAVORATORE|LAV')), 'TICKET_CODE'] = '5-WKRS'
## DDRG 1201-1297/2022
df_a.loc[df_a['DESCRIZIONE_TITOLO'].str.contains('DDGR1201-1297/2022'), 'TICKET_CODE'] = '5'
# Yearly ticket
df_a.loc[df_a['DESCRIZIONE_TITOLO'].str.contains('ANNUALE|ANN|12MESI|12 MESI'), 'TICKET_CODE'] = '6'
## Yearly ticket for students
df_a.loc[(df_a['TICKET_CODE'] == '6') & (df_a['DESCRIZIONE_TITOLO'].str.contains('STUDENTE|STUD|STUD')), 'TICKET_CODE'] = '6-STUD'
## Yearly ticket for retired
df_a.loc[(df_a['TICKET_CODE'] == '6') & (df_a['DESCRIZIONE_TITOLO'].str.contains('OVER 65|65+|PENSIONATI')), 'TICKET_CODE'] = '6-RET'
## Yearly ticket for workers
df_a.loc[(df_a['TICKET_CODE'] == '6') & (df_a['DESCRIZIONE_TITOLO'].str.contains('LAVORATORE|LAV|LAV')), 'TICKET_CODE'] = '6-WKRS'
## Student yearly ticket
df_a.loc[(df_a['DESCRIZIONE_TITOLO'].str.contains('STUDENTE|STUD|STUD')) & ~ (df_a['TICKET_CODE'].isin(['5-STUD', '6-STUD'])), 'TICKET_CODE'] = '6-STUD'
# 75 minutes ticket
df_a.loc[df_a['DESCRIZIONE_TITOLO'].str.contains('75\'|75MIN|75 MIN'), 'TICKET_CODE'] = '7'
# Other ticket
df_a.loc[~df_a['TICKET_CODE'].isin(['1','2','3','4','5','5-STUD','5-WKRS','5-RET','6','6-STUD','6-WKRS','6-RET','7']), 'TICKET_CODE'] = '8'
# Plot a pie chart of the column 'TICKET_CODE'
fig, ax = plt.subplots(figsize=(20,10))
df_a['TICKET_CODE'].value_counts().sort_index().plot.pie(startangle=90)
# Add the name of the ticket profile on the pie chart
plt.legend(labels=df_a['TICKET_CODE'].value_counts().sort_index().rename(dict_tickets).index, loc='center left', bbox_to_anchor=(1, 0, 0.5, 1), fontsize=15)
plt.title('Pie chart of the column TICKET_CODE', fontsize=20)
plt.ylabel('')
fig.patch.set_facecolor('white')
plt.show()
# Delete stamps with ticket code 8
df_a = df_a[df_a['TICKET_CODE'] != '8']
return df_a
# %%
df = assignTicketCode(df)
df.head()
# %% [markdown]
# ## Data Cleaning
# %% [markdown]
# ### Useless stamps
# %%
# Reset the index of the df and drop the old index in order to have a new index starting from 0 to the number of rows
# It is necessary to have a new index because the groupby function has created a multi-index
df.reset_index(drop=True, inplace=True)
# %%
# Create a new column 'MIN_TEMPORAL_GAP' that contains the minimum temporal gap between two validations for the same serial and fermata in minutes
df = df.groupby(['SERIALE','DATA', 'DESCRIZIONE']).apply(lambda x: x.assign(MIN_TEMPORAL_GAP = x['DATA_VALIDAZIONE'].diff().dt.total_seconds()/60))
# %%
df.head(20)
# %%
df.tail(20)
# %%
df['MIN_TEMPORAL_GAP'].value_counts()
# %%
# How many rows have a minimum temporal gap equal to NaN?
df[df['MIN_TEMPORAL_GAP'].isna()].shape[0]
# %%
# Prepare a file to save the information about the results of the cleaning process
# File txt with name: "cleaningResults + filename + date.txt"
import datetime
date = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
with open('data/processed/cleaningResults_' + file_name + '_' + date + '.txt', 'w') as f:
f.write('File name: ' + file_name + '\n')
f.write('Operation starts at: ' + date + '\n')
# %%
# Cleaning operation: remove the rows using the minimum temporal gap
# Find a reasonable delta of MIN_TEMPORAL_GAP to remove the rows that have a minimum temporal gap for the same serial and fermata less than this delta
with open('data/processed/cleaningResults_' + file_name + '_' + date + '.txt', 'a') as f:
# Print the minimum value of the column MIN_TEMPORAL_GAP
print('The minimum value of the column MIN_TEMPORAL_GAP is: {}'.format(df['MIN_TEMPORAL_GAP'].min()))
f.write('The minimum value of the column MIN_TEMPORAL_GAP is: {}\n'.format(df['MIN_TEMPORAL_GAP'].min()))
# Print the maximum value of the column MIN_TEMPORAL_GAP
print('The maximum value of the column MIN_TEMPORAL_GAP is: {}'.format(df['MIN_TEMPORAL_GAP'].max()))
f.write('The maximum value of the column MIN_TEMPORAL_GAP is: {}\n'.format(df['MIN_TEMPORAL_GAP'].max()))
# Print the mean value of the column MIN_TEMPORAL_GAP
print('The mean value of the column MIN_TEMPORAL_GAP is: {}'.format(df['MIN_TEMPORAL_GAP'].mean()))
f.write('The mean value of the column MIN_TEMPORAL_GAP is: {}\n'.format(df['MIN_TEMPORAL_GAP'].mean()))
# Print the median value of the column MIN_TEMPORAL_GAP
print('The median value of the column MIN_TEMPORAL_GAP is: {}'.format(df['MIN_TEMPORAL_GAP'].median()))
f.write('The median value of the column MIN_TEMPORAL_GAP is: {}\n'.format(df['MIN_TEMPORAL_GAP'].median()))
# Print the standard deviation of the column MIN_TEMPORAL_GAP
print('The standard deviation of the column MIN_TEMPORAL_GAP is: {}'.format(df['MIN_TEMPORAL_GAP'].std()))
f.write('The standard deviation of the column MIN_TEMPORAL_GAP is: {}\n'.format(df['MIN_TEMPORAL_GAP'].std()))
# Print the 0.05th percentile of the column MIN_TEMPORAL_GAP
print('The 0.05th percentile of the column MIN_TEMPORAL_GAP is: {}'.format(df['MIN_TEMPORAL_GAP'].quantile(0.05)))
f.write('The 0.05th percentile of the column MIN_TEMPORAL_GAP is: {}\n'.format(df['MIN_TEMPORAL_GAP'].quantile(0.05)))
# Print the 0.10th percentile of the column MIN_TEMPORAL_GAP
print('The 0.10th percentile of the column MIN_TEMPORAL_GAP is: {}'.format(df['MIN_TEMPORAL_GAP'].quantile(0.10)))
f.write('The 0.10th percentile of the column MIN_TEMPORAL_GAP is: {}\n'.format(df['MIN_TEMPORAL_GAP'].quantile(0.10)))
# Print the 25th percentile of the column MIN_TEMPORAL_GAP
print('The 25th percentile of the column MIN_TEMPORAL_GAP is: {}'.format(df['MIN_TEMPORAL_GAP'].quantile(0.25)))
f.write('The 25th percentile of the column MIN_TEMPORAL_GAP is: {}\n'.format(df['MIN_TEMPORAL_GAP'].quantile(0.25)))
# Print the 75th percentile of the column MIN_TEMPORAL_GAP
print('The 75th percentile of the column MIN_TEMPORAL_GAP is: {}'.format(df['MIN_TEMPORAL_GAP'].quantile(0.75)))
f.write('The 75th percentile of the column MIN_TEMPORAL_GAP is: {}\n'.format(df['MIN_TEMPORAL_GAP'].quantile(0.75)))
# Print the 90th percentile of the column MIN_TEMPORAL_GAP
print('The 90th percentile of the column MIN_TEMPORAL_GAP is: {}'.format(df['MIN_TEMPORAL_GAP'].quantile(0.90)))
f.write('The 90th percentile of the column MIN_TEMPORAL_GAP is: {}\n'.format(df['MIN_TEMPORAL_GAP'].quantile(0.90)))
# Print the 95th percentile of the column MIN_TEMPORAL_GAP
print('The 95th percentile of the column MIN_TEMPORAL_GAP is: {}'.format(df['MIN_TEMPORAL_GAP'].quantile(0.95)))
f.write('The 95th percentile of the column MIN_TEMPORAL_GAP is: {}\n'.format(df['MIN_TEMPORAL_GAP'].quantile(0.95)))
# Print the 99th percentile of the column MIN_TEMPORAL_GAP
print('The 99th percentile of the column MIN_TEMPORAL_GAP is: {}'.format(df['MIN_TEMPORAL_GAP'].quantile(0.99)))
f.write('The 99th percentile of the column MIN_TEMPORAL_GAP is: {}\n'.format(df['MIN_TEMPORAL_GAP'].quantile(0.99)))
# Print the 99.9th percentile of the column MIN_TEMPORAL_GAP
print('The 99.9th percentile of the column MIN_TEMPORAL_GAP is: {}'.format(df['MIN_TEMPORAL_GAP'].quantile(0.999)))
f.write('The 99.9th percentile of the column MIN_TEMPORAL_GAP is: {}\n'.format(df['MIN_TEMPORAL_GAP'].quantile(0.999)))
# Decide the delta of MIN_TEMPORAL_GAP to remove the rows that have a minimum temporal gap for the same serial and fermata less than this delta
delta = df['MIN_TEMPORAL_GAP'].quantile(0.1)
if delta == 0:
delta = df['MIN_TEMPORAL_GAP'].quantile(0.25)
if delta == 0:
delta = df['MIN_TEMPORAL_GAP'].median()
print('The delta of MIN_TEMPORAL_GAP is: {}'.format(delta))
f.write('The delta of MIN_TEMPORAL_GAP is: {}\n'.format(delta))
# %%
# Cleaning operation: remove the rows using the minimum temporal gap
# Save the number of rows before the cleaning operation
shape_before = df.shape[0]
# Delete the rows that have a minimum temporal gap for the same serial and fermata more than the delta calculated before.
# Do not remove the rows with NaN values because they are the first validations of the day of a specific serial and fermata usefull for the analysis
df = df[(df['MIN_TEMPORAL_GAP'] > delta) | (df['MIN_TEMPORAL_GAP'].isna())]
with open('data/processed/cleaningResults_' + file_name + '_' + date + '.txt', 'a') as f:
# Print the number of rows before and after the cleaning operation and the difference
print('The number of rows before the cleaning operation is: {}'.format(shape_before))
f.write('The number of rows before the cleaning operation is: {}\n'.format(shape_before))
print('The number of rows after the cleaning operation is: {}'.format(df.shape[0]))
f.write('The number of rows after the cleaning operation is: {}\n'.format(df.shape[0]))
print('The difference is: {}'.format(shape_before - df.shape[0]))
f.write('The difference is: {}\n'.format(shape_before - df.shape[0]))
# Calculate the percentage of rows that has just been deleted
print('The percentage of rows that has just been deleted is: {}%'.format(round((shape_before - df.shape[0])/shape_before*100, 2)))
f.write('The percentage of rows that has just been deleted is: {}%\n'.format(round((shape_before - df.shape[0])/shape_before*100, 2)))
# %%
# Delete the column MIN_TEMPORAL_GAP because it is not useful anymore
df.drop('MIN_TEMPORAL_GAP', axis=1, inplace=True)
# %%
# Create a new dataframe, copied from the original one
df_new = df.copy()
# Print the head of the new dataframe
print(df_new.head())
# Export the new dataframe in a txt file
# The name of the file is dataset_cleaned followed by the name (file_name variable) of the file that has been cleaned with txt extension
name_file = 'dataset_cleaned_temp' + file_name.split('.')[0] + '.txt'
df_new.to_csv('data/processed/' + name_file, sep='\t', index=False)
print('The script has finished')
with open('data/processed/cleaningResults_' + file_name + '_' + date + '.txt', 'a') as f:
f.write('The script has finished\n')
f.write('The name of the file is: ' + name_file + '\n')
date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
f.write('Operation completed at: ' + date + '\n')