-
Notifications
You must be signed in to change notification settings - Fork 2
/
taiwan_election_2020.py
138 lines (137 loc) · 7.57 KB
/
taiwan_election_2020.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import pandas as pd
from urllib.parse import quote_plus
from string import ascii_uppercase
import re
class TaiwanElection2020:
def __init__(self):
self._counties = ['宜蘭縣', '彰化縣', '金門縣', '桃園市', '苗栗縣', '臺南市', '雲林縣', '南投縣', '高雄市', '臺北市', '新北市', '花蓮縣', '新竹市', '新竹縣', '基隆市', '連江縣', '嘉義縣', '嘉義市', '屏東縣', '澎湖縣', '臺東縣', '臺中市']
def tidy_dataframe(self, df):
# updating columns attributes
n_cols = df.columns.size
n_candidates = n_cols - 11
id_vars = ['town', 'village', 'office']
candidates = list(df.columns[3:(3 + n_candidates)])
office_cols = list(ascii_uppercase[:8])
col_names = id_vars + candidates + office_cols
df.columns = col_names
# forward-fill district values
filled_towns = df['town'].fillna(method='ffill')
df = df.assign(town=filled_towns)
# removing summations
df = df.dropna()
# removing extra spaces
stripped_towns = df['town'].str.replace("\u3000", "")
df = df.assign(town=stripped_towns)
# pivoting
df = df.drop(labels=office_cols, axis=1)
tidy_df = pd.melt(df,
id_vars=id_vars,
var_name='candidate_info',
value_name='votes'
)
return tidy_df
def adjust_presidential(self, df):
# split candidate information into 2 columns
candidate_info_df = df['candidate_info'].str.split("\n", expand=True)
numbers = candidate_info_df[0].str.replace("\(|\)", "")
candidates = candidate_info_df[1].str.cat(candidate_info_df[2], sep="/")
# re-arrange columns
df = df.drop(labels='candidate_info', axis=1)
df['number'] = numbers
df['candidate'] = candidates
df['office'] = df['office'].astype(int)
df = df[['county', 'town', 'village', 'office', 'number', 'candidate', 'votes']]
return df
def generate_presidential(self):
presidential = pd.DataFrame()
for county in self._counties:
file_name = "總統-A05-4-候選人得票數一覽表-各投開票所({}).xls".format(county)
file_url = quote_plus(file_name)
spreadsheet_url = "https://taiwan-election-data.s3-ap-northeast-1.amazonaws.com/presidential_2020/{}".format(file_url)
# skip those combined cells
df = pd.read_excel(spreadsheet_url, skiprows=[0, 1, 3, 4], thousands=',')
tidy_df = self.tidy_dataframe(df)
# appending dataframe of each city/county
tidy_df['county'] = county
presidential = presidential.append(tidy_df)
print("Tidying {}".format(file_name))
presidential = presidential.reset_index(drop=True) # reset index for the appended dataframe
presidential_adjusted = self.adjust_presidential(presidential)
return presidential_adjusted
def adjust_legislative(self, df):
# split candidate information into 2 columns
candidate_info_df = df['candidate_info'].str.split("\n", expand=True)
numbers = candidate_info_df[0].str.replace("\(|\)", "")
candidates = candidate_info_df[1]
parties = candidate_info_df[2]
# re-arrange columns
df = df.drop(labels='candidate_info', axis=1)
df['number'] = numbers
df['candidate'] = candidates
df['party'] = parties
df['office'] = df['office'].astype(int)
df['number'] = df['number'].astype(int)
return df
def generate_regional(self):
regional = pd.DataFrame()
for county in self._counties:
file_name = "區域立委-A05-2-得票數一覽表({}).xls".format(county)
file_url = quote_plus(file_name)
spreadsheet_url = "https://taiwan-election-data.s3-ap-northeast-1.amazonaws.com/legislative_2020/{}".format(file_url)
xl = pd.ExcelFile(spreadsheet_url)
for sheet in xl.sheet_names:
# skip those combined cells
df = pd.read_excel(spreadsheet_url, skiprows=[0, 1, 3, 4], thousands=',', sheet_name=sheet)
tidy_df = self.tidy_dataframe(df)
# appending dataframe of each city/county
tidy_df['county'] = county
tidy_df['electoral_district'] = sheet
regional = regional.append(tidy_df)
print("Tidying {} of {}".format(sheet, file_name))
regional = regional.reset_index(drop=True) # reset index for the appended dataframe
regional_adjusted = self.adjust_legislative(regional)
regional_adjusted = regional_adjusted[['county', 'electoral_district', 'town', 'village', 'office', 'number', 'party', 'candidate', 'votes']]
return regional_adjusted
def generate_indigenous(self):
indigenous = pd.DataFrame()
indigenous_types = ['山地', '平地']
for county in self._counties:
for indigenous_type in indigenous_types:
file_name = "{}立委-A05-4-得票數一覽表({}).xls".format(indigenous_type, county)
file_url = quote_plus(file_name)
spreadsheet_url = "https://taiwan-election-data.s3-ap-northeast-1.amazonaws.com/legislative_2020/{}".format(file_url)
# skip those combined cells
df = pd.read_excel(spreadsheet_url, skiprows=[0, 1, 3, 4], thousands=',')
tidy_df = self.tidy_dataframe(df)
# appending dataframe of each city/county
tidy_df['county'] = county
tidy_df['electoral_district'] = '{}原住民'.format(indigenous_type)
indigenous = indigenous.append(tidy_df)
print("Tidying {}".format(file_name))
indigenous = indigenous.reset_index(drop=True) # reset index for the appended dataframe
indigenous_adjusted = self.adjust_legislative(indigenous)
indigenous_adjusted = indigenous_adjusted[['county', 'electoral_district', 'town', 'village', 'office', 'number', 'party', 'candidate', 'votes']]
return indigenous_adjusted
def generate_legislative_regional(self):
regional = self.generate_regional()
indigenous = self.generate_indigenous()
legislative_regional = pd.concat([regional, indigenous], axis=0)
legislative_regional = legislative_regional.reset_index(drop=True)
return legislative_regional
def generate_legislative_at_large(self):
legislative_at_large = pd.DataFrame()
for county in self._counties:
file_name = "不分區立委-A05-6-得票數一覽表({}).xls".format(county)
file_url = quote_plus(file_name)
spreadsheet_url = "https://taiwan-election-data.s3-ap-northeast-1.amazonaws.com/legislative_2020/{}".format(file_url)
# skip those combined cells
df = pd.read_excel(spreadsheet_url, skiprows=[0, 1, 3, 4], thousands=',')
tidy_df = self.tidy_dataframe(df)
# appending dataframe of each city/county
tidy_df['county'] = county
legislative_at_large = legislative_at_large.append(tidy_df)
print("Tidying {}".format(file_name))
legislative_at_large = legislative_at_large.reset_index(drop=True) # reset index for the appended dataframe
legislative_at_large_adjusted = self.adjust_legislative(legislative_at_large)
legislative_at_large_adjusted = legislative_at_large_adjusted[['county', 'town', 'village', 'office', 'number', 'party', 'votes']]
return legislative_at_large_adjusted