-
Notifications
You must be signed in to change notification settings - Fork 2
/
wiki_extract.rb
131 lines (105 loc) · 2.79 KB
/
wiki_extract.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env ruby
require 'net/http'
require 'nokogiri'
require 'html2markdown'
# Reopens HTMLPage to override how a list item is rendered.
class HTMLPage
  # Renders one list item as a Markdown bullet.
  #
  # @param node [Object] the list-item node being converted (not used here)
  # @param contents [#to_s] the already-converted text of the list item
  # @return [String] an asterisk, a single space, then the contents
  def parse_li(node, contents)
    bullet = '*'
    "#{bullet} #{contents}"
  end
end
# Downloads every page linked from a wiki's Special:AllPages listing and writes
# each one into the current directory as a Markdown file with YAML front matter.
#
# Every method is a singleton method (`def self.`). A bare `private` keyword has
# no effect on singleton methods, so all of them have always been publicly
# callable. They stay public for backward compatibility; only `run` is intended
# as the entry point, the rest are internal helpers.
module WikiExtract
  # Path prefix that marks an href as a link to another page of the same wiki.
  WIKI_PREFIX = '/wiki/'.freeze

  # Extracts all pages of the wiki and then creates index.md.
  #
  # @param site [String] host name handed to Net::HTTP.get
  # @return [Object] whatever copy_index returns
  def self.run(site)
    listing = all_pages(site)
    links = links_from_html(listing)
    extract_and_convert_pages(site, links)
    copy_index
  end

  # Fetches the Special:AllPages listing.
  #
  # @param site [String] host name
  # @return [String] response body
  def self.all_pages(site)
    http_get(site, '/wiki/Special:AllPages')
  end

  # Collects the href of every anchor inside the all-pages table.
  #
  # @param html [String] HTML of the listing page
  # @return [Array<String, nil>] href attribute of each matching anchor
  def self.links_from_html(html)
    doc = html_doc(html)
    anchors = doc.xpath('//table[@class = "mw-allpages-table-chunk"]//tr/td/a')
    anchors.map { |anchor| anchor['href'] }
  end

  # Converts each linked page in turn.
  #
  # @param site [String] host name
  # @param links [Array<String>] page paths on that host
  def self.extract_and_convert_pages(site, links)
    links.each { |link| extract_and_convert_page(site, link) }
  end

  # Downloads one page, cleans its content node and writes it out as Markdown.
  # The output file name is derived from the page's first heading.
  #
  # @param site [String] host name
  # @param link [String] path of the page on that host
  def self.extract_and_convert_page(site, link)
    # Named raw_html so the local does not shadow the `html` helper below.
    raw_html = http_get(site, link)
    doc = html_doc(raw_html)
    title = title_from_doc(doc)
    first_heading = first_heading_from_doc(doc)
    content = content_doc(doc)
    cleaned = remove_toc(remove_comments(rewrite_links(first_heading, content)))
    page = build_page(title, first_heading, html(cleaned))
    write_markdown_page(filename(first_heading), page)
  end

  # @param doc [Nokogiri::HTML::Document] parsed page
  # @return [String] text of the <title> element
  def self.title_from_doc(doc)
    doc.at_xpath('//head/title').text
  end

  # @param doc [Nokogiri::HTML::Document] parsed page
  # @return [String] text of the span inside h1#firstHeading
  def self.first_heading_from_doc(doc)
    doc.at_xpath('//h1[@id = "firstHeading"]/span').text
  end

  # @param doc [Nokogiri::HTML::Document] parsed page
  # @return [Nokogiri::XML::Node, nil] the div#mw-content-text node
  def self.content_doc(doc)
    doc.at_xpath('//div[@id = "mw-content-text"]')
  end

  # Rewrites the href of every wiki-internal anchor below +doc+ in place.
  #
  # @param first_heading [String] unused; kept so existing callers keep working
  # @param doc [Nokogiri::XML::Node] node whose descendant anchors are rewritten
  # @return [Nokogiri::XML::Node] the same node
  def self.rewrite_links(first_heading, doc)
    doc.xpath('.//a').each do |anchor|
      anchor['href'] = rewrite_link(anchor) if rewritable_link?(anchor)
    end
    doc
  end

  # Turns a wiki-internal href into the lower-cased slug used for local links.
  # Only the leading "/wiki/" is stripped, and the string stored in +link+ is
  # left unmodified.
  #
  # @param link [#[]] anything answering link['href'] with a String
  # @return [String] slug without the "/wiki/" prefix
  def self.rewrite_link(link)
    slug(link['href'].sub(%r{\A/wiki/}, ''))
  end

  # Removes comment nodes below +doc+.
  #
  # @param doc [Nokogiri::XML::Node] node to clean
  # @return [Nokogiri::XML::Node] the same node
  def self.remove_comments(doc)
    # Relative path: only touch the node we were given, not the whole document.
    doc.xpath('.//comment()').remove
    doc
  end

  # Removes the table with id "toc" below +doc+.
  #
  # @param doc [Nokogiri::XML::Node] node to clean
  # @return [Nokogiri::XML::Node] the same node
  def self.remove_toc(doc)
    # Relative path: only touch the node we were given, not the whole document.
    doc.xpath('.//table[@id = "toc"]').remove
    doc
  end

  # Tells whether an anchor points at another page of the same wiki.
  # Anchors without an href attribute are simply not rewritable.
  #
  # @param link [#[]] anything answering link['href']
  # @return [Boolean] true when the href starts with "/wiki/"
  def self.rewritable_link?(link)
    link['href'].to_s.start_with?(WIKI_PREFIX)
  end

  # @param doc [Nokogiri::XML::Node] node to serialise
  # @return [String] inner HTML of the node
  def self.html(doc)
    doc.inner_html
  end

  # Derives the Markdown file name for a page heading.
  #
  # @param heading [String] first heading of the page
  # @return [String] slug of the heading followed by ".md"
  def self.filename(heading)
    "#{slug(heading)}.md"
  end

  # Shared slug rule for file names and rewritten links: spaces become
  # underscores, slashes become dashes, everything is lower-cased.
  #
  # @param name [String] heading or link path
  # @return [String] a new string; +name+ is not modified
  def self.slug(name)
    name.gsub(' ', '_').gsub('/', '-').downcase
  end

  # Assembles the final page: YAML front matter, a level-one heading and the
  # Markdown conversion of the content.
  #
  # @param title [String] value for the front-matter "title" key
  # @param first_heading [String] text of the level-one heading
  # @param content [String] HTML to convert to Markdown
  # @return [String] complete file contents
  def self.build_page(title, first_heading, content)
    header = "---\n" \
             "layout: default\n" \
             "title: #{yaml_quote(title)}\n" \
             "---\n\n" \
             "# #{first_heading}\n\n"
    header + markdown(content)
  end

  # Wraps text in a YAML double-quoted scalar so that titles containing ": ",
  # "#" or quotes cannot break the front matter.
  #
  # @param text [#to_s] raw value
  # @return [String] the value with backslashes and double quotes escaped,
  #   surrounded by double quotes
  def self.yaml_quote(text)
    # Block form of gsub so backslashes in the replacement are taken literally.
    escaped = text.to_s.gsub('\\') { '\\\\' }.gsub('"') { '\\"' }
    "\"#{escaped}\""
  end

  # Writes a page to disk, replacing any existing file of that name.
  #
  # @param filename [String] target path
  # @param page [String] file contents
  def self.write_markdown_page(filename, page)
    File.open(filename, 'w') { |file| file.write(page) }
  end

  # Converts HTML to Markdown through HTMLPage.
  #
  # @param html [String] HTML fragment
  # @return [String] result of HTMLPage#markdown (presumably Markdown text;
  #   HTMLPage is defined outside this module)
  def self.markdown(html)
    HTMLPage.new(:contents => html).markdown
  end

  # @param site [String] host name
  # @param page [String] request path
  # @return [String] response body
  def self.http_get(site, page)
    Net::HTTP.get(site, page)
  end

  # @param html [String] HTML source
  # @return [Nokogiri::HTML::Document] parsed document
  def self.html_doc(html)
    Nokogiri::HTML(html)
  end

  # Copies main_page.md to index.md in the current directory.
  #
  # @raise [Errno::ENOENT] when main_page.md does not exist
  def self.copy_index
    # Read the source first: opening index.md for writing truncates it, so a
    # missing main_page.md must fail before any existing index is touched.
    main_page = File.read('main_page.md')
    File.open('index.md', 'w') { |file| file.write(main_page) }
  end
end
# Command-line entry point: the first argument is the wiki host to extract.
site = ARGV[0]
# Fail early with a usage message instead of passing nil down to Net::HTTP.
abort "usage: #{$PROGRAM_NAME} <wiki-host>" if site.nil? || site.empty?
WikiExtract.run(site)