This repository has been archived by the owner on Dec 29, 2021. It is now read-only.
-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
amtrak.py
executable file
·103 lines (85 loc) · 3.48 KB
/
amtrak.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/env python3
import pandas as pd
from dateutil import parser
#
from amtrakconn import getday, plottrain, plottrains
"""
capabilities:
1) polite scraping of Amtrak Status Reports
2) storing scrapes in same format as site archives (nested zip)
3) statistical analysis using pandas dataframes
TODO:
1) make scraping more polite with keep alive
2) make "hours left to make connection" work more reliably with missing data
EXAMPLE USE (assuming you've already downloading ZIP archives from dixielandsoftware.net)
./amtrak.py 29 7 -d 2013-05-15 2013-05-31
that examines connections between the Capitol Limited to the Empire Builderfor the later half of May 2013
"""
def main(dates, datafn, trains, stop, makeplot, zipfn, doscrape):
delays = {}
actual = {}
days = {}
if datafn is None:
autodate = True
else:
autodate = False
for train in trains:
delays[train] = pd.DataFrame()
actual[train] = pd.DataFrame()
for date in dates:
if autodate:
datafn = date.strftime("%Y") # defaults to nested zip file
pn = date.strftime("%Y/%m/%d")
daydata = getday(datafn, date, train, zipfn, doscrape)
if daydata is not None:
delays[train][pn] = daydata["delayhours"]
actual[train][pn] = daydata["act"]
days[train] = daydata["day"].max()
if daydata is None or daydata["delayhours"].isnull().all():
print("* Note: no data for train " + train + " on " + date.strftime("%Y/%m/%d"))
# if h5fn is not None:
# print('** warning this has not been tested at all')
# tohdf5(h5fn,daydata,date)
plottrain(delays[train], train, dates, stop, makeplot)
plottrains(delays, actual, days, trains, dates, makeplot)
return delays
if __name__ == "__main__":
from argparse import ArgumentParser
p = ArgumentParser(description="Loads Google Forms responses XLS and analyses")
p.add_argument("train", help="train number", type=str, nargs="+")
p.add_argument(
"-d",
"--date",
help="START [STOP] date range of train, format YYYY-MM-DD",
type=str,
nargs="+",
required=True,
)
p.add_argument(
"-f",
"--file",
help="load disk html, txt, zip file (no auto download)",
type=str,
default=None,
)
p.add_argument("-m", "--makeplot", help="plots to make (default all)", type=str, default="all")
p.add_argument("--h5", help="write hdf5 of data to this file", type=str, default=None)
p.add_argument("--zip", help="write zip like the website does", type=str, default=None)
p.add_argument("--scrape", help="scrape from web--use with caution!", action="store_true")
p.add_argument("-s", "--stop", help="three-letter station to debark", type=str, default=None)
ar = p.parse_args()
makeplot = ar.makeplot
if ar.scrape and ar.zip:
doscrape = True
elif ar.scrape and not ar.zip:
doscrape = False
print("disabled scraping since you didnt choose to save as zip (dont be wasteful)")
else:
doscrape = False
if len(ar.date) != 2:
dates = [parser.parse(d) for d in ar.date]
else: # len(ar.date) == 2:
dates = pd.date_range(start=ar.date[0], end=ar.date[1])
if len(dates) == 0:
exit("Your specified date range doesnt make sense")
delay = main(dates, ar.file, ar.train, ar.stop, makeplot, ar.zip, doscrape)