-
Notifications
You must be signed in to change notification settings - Fork 0
/
ScrapeSubjects.ps1
27 lines (24 loc) · 1.11 KB
/
ScrapeSubjects.ps1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# Scrapes the subject abbreviations and their descriptions for every dict.cc
# language pair and writes them as JSON to TranslateWithDictCC\Assets\Subjects.json
# (UTF-8 without BOM), relative to the directory containing this script.
#
# Output shape: { "<languagePair>": { "<abbreviation>": "<description>", ... }, ... }

# Abort on the first error. Without this, a failed request inside the loop would
# leave $response holding the previous iteration's page, and that stale data
# would be stored under the wrong language pair and written to the asset file.
$ErrorActionPreference = 'Stop'

# The ParsedHtml property used below only exists in Windows PowerShell;
# PowerShell 6+ supports basic parsing only. Fail early with a clear message.
if ($PSVersionTable.PSEdition -eq 'Core')
{
    throw "This script relies on Invoke-WebRequest's ParsedHtml property, which is only available in Windows PowerShell."
}

# get list of language pairs
$response = Invoke-WebRequest -Uri "https://browse.dict.cc/"
$optionElements = @($response.ParsedHtml.body.getElementsByTagName("option"))
# Keep only the 4-character option values; these are used as subdomains below
# (presumably two concatenated 2-letter language codes - verify against the page).
$languagePairs = $optionElements.value | where { $_.Length -eq 4 } | select -Unique

# scrape subjects for every language pair
# Ordered dictionaries keep the order in which entries were scraped, so the
# generated JSON is stable from run to run.
$subjects = [ordered]@{}
foreach ($languagePair in $languagePairs)
{
    $response = Invoke-WebRequest -Uri "https://$languagePair.dict.cc/subjects.php"

    # The subjects table is identified by its width attribute of 730.
    $table = $response.ParsedHtml.body.getElementsByTagName("table") | where { $_.getAttributeNode('Width').Value -eq '730' }
    if ($null -eq $table)
    {
        throw "No subjects table found for language pair '$languagePair'."
    }

    # The first row is skipped (presumably the table header - verify against the page).
    $rows = $table.getElementsByTagName("tr") | select -Skip 1

    $pairs = [ordered]@{}
    foreach ($row in $rows)
    {
        $cells = @($row.getElementsByTagName("td"))

        # Rows without both an abbreviation and a description cell carry no
        # usable entry; skipping them also avoids a null dictionary key.
        if ($cells.Count -lt 2)
        {
            continue
        }

        $abbreviation = $cells[0].innerText
        $description = $cells[1].innerText
        if ($null -eq $abbreviation)
        {
            continue
        }

        $pairs[$abbreviation] = $description
    }

    $subjects[$languagePair] = $pairs
}

$json = $subjects | ConvertTo-Json

# Write UTF-8 without a byte order mark.
$utf8NoBomEncoding = New-Object System.Text.UTF8Encoding $False
[IO.File]::WriteAllLines((Join-Path $PSScriptRoot "TranslateWithDictCC\Assets\Subjects.json"), $json, $utf8NoBomEncoding)