Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/scale dataset #279

Open
wants to merge 19 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 103 additions & 0 deletions data/fakedata.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
// TODO: this needs a serious refactor. It's a mess. Use the shared library. This is gross.

// import fetch from "node-fetch";
import { JSDOM } from "jsdom";
import * as fs from "fs";

const schoolListFileName = "./schoolList.json";

/**
 * One school entry scraped from the SFUSD directory table and persisted in
 * the school list JSON file.
 */
type SchoolRecord = {
  /** `href` of the school's title link; appended to the site origin by `schoolURLFromStub`. */
  schoolStub: string;
  /** Trimmed text of the school's title link. */
  schoolName: string;
  /** Text of the grades cell that precedes the first parenthesis. */
  gradesLabel: string;
  /** Individual comma-separated codes taken from inside the parentheses of the grades cell. */
  gradeCodes: string[];
  /** Address cell text with runs of whitespace collapsed to single spaces. */
  address: string;
};

/**
 * Builds the absolute URL of a school page on the SFUSD website.
 *
 * @param stub - Path portion of the school link, e.g. `/school/some-school`.
 * @returns `stub` appended verbatim to `https://www.sfusd.edu`.
 */
function schoolURLFromStub(stub: string): string {
  const sfusdOrigin = "https://www.sfusd.edu";
  return sfusdOrigin + stub;
}

/**
 * Fetches the SFUSD school directory table
 * (https://www.sfusd.edu/schools/directory/table) and parses each table row
 * into a `SchoolRecord`.
 *
 * Rows without a title link (for example the header row) are skipped.
 *
 * @returns One record per table row that has a non-empty school name.
 * @throws Error when the directory page responds with a non-OK HTTP status,
 *   so an error page is never parsed as if it were the directory.
 */
async function fetchAndParseSFUSD(): Promise<SchoolRecord[]> {
  const response = await fetch("https://www.sfusd.edu/schools/directory/table");
  if (!response.ok) {
    throw new Error(`HTTP error! Status: ${response.status}`);
  }
  const html = await response.text();

  const dom = new JSDOM(html);
  const document = dom.window.document;

  const schoolData: SchoolRecord[] = [];

  for (const row of Array.from(document.querySelectorAll("tr"))) {
    // Name and stub both come from the same anchor, so look it up once.
    const titleLink = row.querySelector<HTMLAnchorElement>(
      "td.views-field-title a",
    );
    const schoolName = titleLink?.textContent?.trim() || "";
    if (schoolName === "") continue;

    const schoolStub = titleLink?.href?.trim() || "";

    // The grades cell is split on parentheses: the part before the first
    // parenthesis is the label, the part inside is a comma-separated code list.
    const gradesText =
      row.querySelector("td.views-field-nothing-1")?.textContent?.trim() || "";
    const [gradesLabel = "", gradesCodeString = ""] = gradesText.split(
      /[()]/,
      2,
    );

    // Always produce an array (possibly empty) so the record really matches
    // the declared `gradeCodes: string[]` type.
    const gradeCodes = gradesCodeString
      .split(",")
      .map((code) => code.trim())
      .filter((code) => code !== "");

    const address =
      row.querySelector("td.views-field-nothing")?.textContent?.trim() || "";

    schoolData.push({
      schoolStub,
      schoolName,
      gradesLabel: gradesLabel.trim(),
      gradeCodes,
      address: address.replace(/\s+/g, " ").trim(),
    });
  }

  return schoolData;
}

/**
 * Reads the school list JSON file (`schoolListFileName`) from disk.
 *
 * File-system errors (e.g. the file does not exist) are not caught here and
 * reject the returned promise.
 *
 * @returns The parsed records, or `undefined` when the file content is not
 *   valid JSON or is not a JSON array (the problem is logged).
 */
async function readSchoolListFromFile(): Promise<SchoolRecord[] | undefined> {
  const fileText = fs.readFileSync(schoolListFileName, { encoding: "utf-8" });
  try {
    const parsed: unknown = JSON.parse(fileText);
    // Callers iterate the result, so anything other than an array is treated
    // the same way as malformed JSON.
    if (!Array.isArray(parsed)) {
      throw new TypeError("expected a JSON array of school records");
    }
    return parsed as SchoolRecord[];
  } catch (parseError) {
    console.error("error parsing file", parseError);
    return undefined;
  }
}

/**
 * Loads the school list from disk and logs, for every school, its position in
 * the list and the page URL derived from its stub.
 *
 * No network request is made here; the function only logs what it would fetch.
 */
async function fetchSchoolDetails() {
  const schools = await readSchoolListFromFile();
  console.log(`${schools?.length} schools read from ${schoolListFileName}`);

  // Nothing to iterate when the list could not be read.
  if (!schools) return;

  for (const [position, school] of schools.entries()) {
    console.log(
      `fetching school details ${position + 1} of ${schools.length}: ${school.schoolName} (${school.schoolStub})`,
    );

    const pageUrl = schoolURLFromStub(school.schoolStub);
    console.log(`fetching ${pageUrl}`);
  }
}

// create schoolList.json
// fetchAndParseSFUSD()
// .catch((err) => console.error("Error fetching and parsing data:", err))
// .then((data) => {
// if (data)
// fs.writeFileSync(schoolListFileName, JSON.stringify(data), {
// encoding: "utf-8",
// });
// console.log("done.");
// });

// Script entry point: run the detail pass and report the outcome on the console.
void (async () => {
  try {
    await fetchSchoolDetails();
    console.log("done.");
  } catch (err) {
    console.error("something went wrong!", err);
  }
})();
52 changes: 52 additions & 0 deletions data/fetchFullImages.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// fetchFullImages.ts
// Purpose: fetch full-size images for each school in the school list

import {
readSchoolList,
schoolListFilePath,
sleep,
extractExtensionFromUrl,
downloadImage,
writeSchoolList,
} from "./shared";

// "Mercy mode": delay in milliseconds passed to sleep() before each image
// download, so this script does not behave like a DoS against the remote server.
const dosDelay = 3000;

/**
 * Downloads the full-size image of every school in the school list and
 * records the local file path on each school's `image` entry.
 *
 * For each school the target path is `school_img/<schoolStub>.<ext>`, where
 * `<ext>` comes from `extractExtensionFromUrl`; when no usable extension is
 * found the path is the empty string and nothing is downloaded. The updated
 * list is written back once all schools have been processed.
 */
async function downloadFullImages() {
  if (dosDelay > 100) {
    console.warn(
      `mercy mode enabled - expect roughly ${dosDelay}ms delay between fetches ...`,
    );
  }

  const schoolList = readSchoolList();

  for (const school of schoolList) {
    const imageUrl = school.image?.src || "";
    const extension = extractExtensionFromUrl(imageUrl);

    let localPath = "";
    if (extension && extension.length > 1) {
      localPath = `school_img/${school.schoolStub}.${extension}`;
    }

    // Record the (possibly empty) path on the school entry before downloading.
    if (school.image) {
      school.image.filePath = localPath;
    }

    // Skip schools that have no image URL or no resolvable target path.
    if (!(imageUrl.length > 1 && localPath.length > 1)) {
      continue;
    }

    await sleep(dosDelay, true);
    console.log(`storing ${imageUrl} as ${localPath}`);
    await downloadImage(imageUrl, `public/${localPath}`);
  }

  // Persist the list so the new image paths are saved.
  console.log(`writing results to ${schoolListFilePath}`);
  writeSchoolList(schoolList);
}

/**
 * Script entry point: starts the image download pass, prints "done." on
 * success and logs any rejection via `console.error`.
 */
function main() {
  const reportDone = () => {
    console.log("done.");
  };

  downloadFullImages().then(reportDone).catch(console.error);
}

main();
63 changes: 63 additions & 0 deletions data/fetchLatLong.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
// fetchLatLong.ts
// Description: Fetches the latitude and longitude and updates the school list file with the geolocations.
// Depends on the AWS_GEO_KEY environment variable being set to the AWS API key enabled with location services access.

import { readSchoolList, writeSchoolList } from "./shared";

/**
 * Geocodes a single address with the AWS Places geocode endpoint
 * (us-west-2), constrained to San Francisco, US.
 *
 * The API key is read from the `AWS_GEO_KEY` environment variable and is
 * URL-encoded into the query string. Failures (missing key, network error,
 * non-OK HTTP status, malformed response) are logged and reported as `null`
 * rather than thrown.
 *
 * @param address - Free-text address sent as `QueryText`.
 * @param cached - When true, adds `intended-use=Storage` to the request
 *   (presumably telling AWS the result will be stored; confirm against the
 *   Amazon Location documentation).
 * @returns Title, address details, position and map view of the first result
 *   item (fields are `undefined` when there is no result item), or `null` on
 *   failure.
 */
async function awsGeoLocate(address: string, cached: boolean = false) {
  try {
    const apiKey = process.env.AWS_GEO_KEY;
    if (!apiKey) {
      // Fail fast instead of sending the literal string "undefined" as the key.
      throw new Error("AWS_GEO_KEY environment variable is not set");
    }

    // Build the query string via URLSearchParams so the key is encoded correctly.
    const endpoint = new URL(
      "https://places.geo.us-west-2.amazonaws.com/v2/geocode",
    );
    endpoint.searchParams.set("key", apiKey);
    if (cached) {
      endpoint.searchParams.set("intended-use", "Storage");
    }

    const response = await fetch(endpoint.href, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        QueryText: address,
        QueryComponents: { Locality: "San Francisco", Country: "US" },
      }),
    });

    if (!response.ok) {
      throw new Error(`HTTP error! Status: ${response.status}`);
    }

    const result = await response.json();
    // Only the first result item is used.
    const item = result.ResultItems[0];

    return {
      addressString: item?.Title,
      addressDetails: item?.Address,
      geo: item?.Position,
      geoBounds: item?.MapView,
    };
  } catch (error) {
    console.error("Error posting data:", error);
    return null;
  }
}

/**
 * Geocodes several addresses concurrently.
 *
 * @param addresses - Addresses to look up; each is passed to `awsGeoLocate`.
 * @param cached - Forwarded unchanged to every `awsGeoLocate` call.
 * @returns The `awsGeoLocate` results, in the same order as `addresses`.
 */
async function awsGeoLocateArray(addresses: string[], cached: boolean = false) {
  const lookups = addresses.map((address) => awsGeoLocate(address, cached));
  return Promise.all(lookups);
}

/**
 * Script entry point: geocodes the `locations` of every school in the school
 * list, writes the list back with a `geolocations` array added to each
 * school, and prints the result as JSON.
 *
 * A failure while geocoding or writing is logged and marks the process as
 * failed instead of surfacing as an unhandled promise rejection.
 */
function main() {
  const schoolList = readSchoolList();
  Promise.all(
    schoolList.map(async (school) => {
      const geolocations = await awsGeoLocateArray(school.locations);
      return { ...school, geolocations };
    }),
  )
    .then((results) => {
      writeSchoolList(results);
      console.log(JSON.stringify(results, null, 2));
    })
    .catch((error) => {
      console.error("Error updating geolocations:", error);
      // Keep a non-zero exit status so callers of the script still see the failure.
      process.exitCode = 1;
    });
}

main();
60 changes: 60 additions & 0 deletions data/fetchLogos.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// document.querySelector(".site-logo>img").getAttribute("src")
import { JSDOM } from "jsdom";

import {
readSchoolList,
SchoolProfile,
writeSchoolList,
downloadImage,
schoolStubFromUrlStub,
SchoolLogo,
extractExtensionFromUrl,
sleep,
} from "./shared";

// "Mercy mode": delay in milliseconds passed to sleep() before each logo
// download, so this script does not behave like a DoS against the remote server.
const dosDelay = 3000;

/**
 * Fetches a school page and extracts its logo from the `.site-logo>img`
 * element.
 *
 * @param url - Address of the school page to fetch.
 * @returns The logo's `src` and `alt` attributes (empty strings when the
 *   element or attribute is absent) plus a file name built from the school
 *   stub of `url` and the extension of the logo URL.
 */
async function extractLogoDetails(url: string): Promise<SchoolLogo> {
  const pageResponse = await fetch(url);
  const pageHtml = await pageResponse.text();

  const { document } = new JSDOM(pageHtml).window;
  const logoImg = document.querySelector(".site-logo>img");

  const logoUrl: string = logoImg?.getAttribute("src") || "";
  const logoAltText: string = logoImg?.getAttribute("alt") || "";

  // File name is "<school stub>.<logo extension>".
  const stub = schoolStubFromUrlStub(url);
  const extension = extractExtensionFromUrl(logoUrl);
  const filePath = `${stub}.${extension}`;

  return { logoUrl, logoAltText, filePath };
}

// extractLogoDetails("https://www.sfusd.edu/school/ap-giannini-middle-school")
// .then((details) => {
// console.log(JSON.stringify(details, null, 2));
// console.log("done.");
// })
// .catch(console.error);

/**
 * Walks the school list, extracts each school's logo details from its page,
 * attaches them to the school entry as `logo` and downloads the logo image
 * into `public/school_img/logo/`. Schools without a logo URL are left
 * untouched. The list is written back after all schools are processed.
 */
async function main() {
  const schools = readSchoolList();

  for (const school of schools) {
    console.log(`extracting logo for ${school.schoolUrl}`);
    const logo = await extractLogoDetails(school.schoolUrl);

    // No logo on the page: nothing to record or download.
    if (logo.logoUrl === "") {
      continue;
    }

    Object.assign(school, { logo });
    await sleep(dosDelay, true);

    const destination = `public/school_img/logo/${logo.filePath}`;
    await downloadImage(logo.logoUrl, destination);
  }

  writeSchoolList(schools);
}

// Script entry point: run main(), print "done." on success, log any failure.
void (async () => {
  try {
    await main();
    console.log("done.");
  } catch (err) {
    console.error(err);
  }
})();
51 changes: 51 additions & 0 deletions data/gdriveTest.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// an attempt to extract school data directly from the SARCS PDFs
// using the Google Drive API
//
// This test is on-hold since the SFUSD data may be available
// in a more structured format from:
// https://caaspp-elpac.ets.org/caaspp/ResearchFileListSB?ps=true&lstTestYear=2024&lstTestType=B&lstCounty=38&lstDistrict=68478-000&lstFocus=b

import { google } from "googleapis";

/**
 * Lists the files in a Google Drive folder and prints those whose name
 * contains both "eng" and "23" (case-insensitive) as `name (id)`.
 *
 * Authenticates with the JSON key file at `data/sfusd-data-secretkey.json`.
 * All result pages are fetched, so folders larger than one page of results
 * are searched completely.
 *
 * @param folderId - ID of the Drive folder whose direct children are listed.
 */
async function listFilesInFolder(folderId: string) {
  // Initialize client
  const auth = new google.auth.GoogleAuth({
    keyFile: "data/sfusd-data-secretkey.json", // replace with your JSON key file
    scopes: ["https://www.googleapis.com/auth/drive"],
  });

  const drive = google.drive({ version: "v3", auth });

  // Backslashes and single quotes must be escaped inside a Drive query string literal.
  const escapedFolderId = folderId.replace(/[\\']/g, "\\$&");

  // grab the file list, following nextPageToken until every page has been read
  const files: { id?: string | null; name?: string | null }[] = [];
  let pageToken: string | undefined;
  do {
    const res = await drive.files.list({
      q: `'${escapedFolderId}' in parents`,
      fields: "nextPageToken, files(id, name)",
      pageToken,
    });
    files.push(...(res.data.files ?? []));
    pageToken = res.data.nextPageToken ?? undefined;
  } while (pageToken);

  if (files.length === 0) {
    console.log("No files found.");
    return;
  }

  // grab the right file (hopefully)
  const candidateFiles = files.filter((file) => {
    const lowerName = file.name?.toLowerCase();
    return (
      lowerName !== undefined &&
      lowerName.includes("eng") &&
      lowerName.includes("23")
    );
  });

  for (const file of candidateFiles) {
    console.log(`${file.name} (${file.id})`);
  }
}

// Script entry point: list the candidate files of the hard-coded folder ID,
// print "done" on success and log any failure.
void (async () => {
  try {
    await listFilesInFolder("1AwM5P8Pf3JqhqqAy8aRELwyqtdK6ew4W");
    console.log("done");
  } catch (err) {
    console.error(err);
  }
})();

// grab the right file
// Select and Read a File: Once you’ve listed files, you can use their IDs to download or read specific files. The drive.files.get method with alt: 'media' can be used to fetch file content.
Loading