-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathL_Tagger.py
116 lines (90 loc) · 8.16 KB
/
L_Tagger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# coding=utf-8
### FAMOUS FIRST WORDS ;-)
# - names of 3 characters or fewer are not considered while tagging the articles
# - a road name is used for tagging an article only if it is NOT a substring of a tag that article already carries
# - a road name is preferred over an administration level name (if a road name was found, an administration level name will not be considered)
# - Let's go! ;-)
# IMPORTS
import csv
import sys
# LISTS
# All three lists are filled by main().
list_london_roads = []        # one [name, position] entry per road
list_london_adminlevels = []  # one [name, level, geom] entry per administration level
list_london_articles = []     # one [title, date, content] entry per article; tags are appended as name/position pairs


# FUNCTIONS
def _open_csv(path, mode='r'):
    """Open *path* in the mode the running interpreter's csv module expects.

    Python 3's csv module needs text files opened with newline=''; Python 2's
    needs binary files. *mode* is 'r' or 'w' without a 'b'/'t' suffix.
    """
    if sys.version_info[0] >= 3:
        # NOTE(review): assumes the data files are UTF-8 encoded - adjust the
        # encoding here if they are not.
        return open(path, mode, newline='', encoding='utf-8')
    return open(path, mode + 'b')


def _read_rows(path, delimiter):
    """Return all non-empty rows of the delimited text file *path*."""
    with _open_csv(path) as handle:
        # csv.reader yields [] for blank lines; those carry no data.
        return [row for row in csv.reader(handle, delimiter=delimiter) if row]


def _mentions(name, article):
    """Return True if *name* occurs in the title or the content of *article*."""
    return name in article[0] or name in article[2]


def main():
    """Tag the London articles with road and administration level names.

    Reads 'london_roads.csv' (';'-separated: name, position),
    'london_admin.csv' (';'-separated: name, level, geom) and
    'london_articles.csv' ('|'-separated: title, date, content) from the
    current working directory into the module-level lists.

    An article is tagged by appending a name and its position to the article's
    list entry. Each time an article receives a tag, its whole entry (with all
    tags collected so far) is written as one CSV row to
    'london_articles_tagged.txt'. Only names longer than 3 characters are used.

    * Roads are applied first. A road is skipped for an article if its name is
      a substring of a tag the article already carries.
    * Administration levels are applied only to articles that received no
      road tag; such an article gets the first matching level.

    Progress, the titles of untagged articles and a summary are printed.
    A row with too few columns raises IndexError.
    """
    min_name_length = 3  # names must be longer than this to be used as tags
    num_tagged = 0       # running number of tags written (shown in progress output)
    num_not_tagged = 0

    print("Preparing needed lists...\n")

    # ROADS
    for row in _read_rows('london_roads.csv', ';'):
        list_london_roads.append([row[0], row[1]])
    print("Filling list with roads = done...\n")

    # ADMIN LEVELS
    # Split on ';' directly. Splitting the repr of a comma-parsed row left
    # "['" in front of every name, so no administration level ever matched.
    for row in _read_rows('london_admin.csv', ';'):
        list_london_adminlevels.append([row[0].replace('"', ''), row[1], row[2]])
    print("Filling lists with admin levels = done...\n")

    # ARTICLES
    for row in _read_rows('london_articles.csv', '|'):
        content = row[2].replace('\n', ' ').replace('\r', ' ')
        list_london_articles.append([row[0], row[1], content])
    print("Filling lists with articles = done...\n\n")

    print("Start tagging articles ...\n")
    is_tagged = [False] * len(list_london_articles)  # one flag per article

    with _open_csv('london_articles_tagged.txt', 'w') as output:
        writer = csv.writer(output)

        for road_name, road_position in list_london_roads:
            if len(road_name) <= min_name_length:
                continue
            for index, article in enumerate(list_london_articles):
                if not _mentions(road_name, article):
                    continue
                # Everything after the first three fields is a tag.
                if any(road_name in tag for tag in article[3:]):
                    continue
                article.append(road_name)      # tag article with a name (road)
                article.append(road_position)  # tag article with a position (road)
                writer.writerow(article)
                is_tagged[index] = True
                num_tagged += 1
                print(str(num_tagged)+' tagged article with a street name: "' + str(article[0]) + '"')
        print("Roads have been tagged ...\n")

        for admin_name, _level, admin_geom in list_london_adminlevels:
            if len(admin_name) <= min_name_length:
                continue
            for index, article in enumerate(list_london_articles):
                # An untagged article has no tags yet, so no duplicate check
                # is needed here.
                if is_tagged[index] or not _mentions(admin_name, article):
                    continue
                article.append(admin_name)  # tag article with a name (admin level)
                article.append(admin_geom)  # tag article with a position (admin level)
                writer.writerow(article)
                is_tagged[index] = True
                num_tagged += 1
                print(str(num_tagged)+' tagged article with an administration level name: "' + str(article[0]) + '"')
        print("Administration levels have been tagged ...\n")

    for index, article in enumerate(list_london_articles):
        if not is_tagged[index]:
            print("WARNING: NOT TAGGED --> " + article[0])  # title of an article that has not been tagged
            num_not_tagged += 1

    # TIME TAGGING ?
    # - done by using the second attribute "time" of the extracted articles

    # Report distinct articles; num_tagged counts tags, and one article can
    # receive several road tags.
    num_tagged_articles = len(is_tagged) - num_not_tagged
    print("Number of tagged articles: "+str(num_tagged_articles)+" | Number of NOT tagged articles "+str(num_not_tagged)+" .\n")
    print("TAGGING COMPLETED!\n")
# Run the tagger only when this file is executed as a script.
if "__main__" == __name__:
    main()