Skip to content

Commit

Permalink
hotfix: okky crawler
Browse files Browse the repository at this point in the history
  • Loading branch information
in-seo committed Jul 24, 2024
1 parent 7ef4380 commit ea3cf3b
Showing 1 changed file with 7 additions and 3 deletions.
10 changes: 7 additions & 3 deletions SouP/src/main/java/Matching/SouP/crawler/okky/OkkyService.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ public void getOkkyPostData() {
String html = driver.getPageSource();
Document doc = Jsoup.parse(html);
for (int i = 20; i > 0; i--) { //오래된 글부터 크롤링 그럼 반드시 최신글은 DB에서 가장 밑에꺼임.
- Elements element = doc.select("#__next > main > div > div:nth-child(2) > div > div:nth-child(6) > div > ul > li.py-4:nth-child(" + i + ")");
+ Elements element = doc.select("#__next > main > div > div:nth-child(2) > div > div:nth-child(5) > div > ul > li.py-4:nth-child(" + i + ")");
Elements title = element.select("div > div.my-2 > a");
String postName = title.text();
String num;
Expand All @@ -51,7 +51,7 @@ public void getOkkyPostData() {

String link = "https://okky.kr/articles/"+num;
Document realPost = click(driver, link);
- String content = realPost.select("#__next > main > div > div:nth-child(2) > div > div:nth-child(3) > div:nth-child(2) > div:nth-child(3) > div > div > div").text();
+ String content = realPost.select("#__next > main > div > div:nth-child(2) > div > div:nth-child(2) > div:nth-child(2) > div:nth-child(3) > div > div > div").text();
System.out.println(content);
StringBuilder stack = CrawlerService.parseStack(postName,content);
String talk = "";
Expand Down Expand Up @@ -99,12 +99,15 @@ private int startPage(WebDriver driver, int start) throws StringIndexOutOfBounds
*/
int cnt = 1;
while(true){
+ if (page > 5) {
+ throw new IllegalStateException("오키 파싱 에러");
+ }
driver.get(urlOkky + "?page=" + page);
String html = driver.getPageSource();
Document doc = Jsoup.parse(html);
int num = Integer.MAX_VALUE;
try {
- String href = doc.select("#__next > main > div > div:nth-child(2) > div > div:nth-child(6) > div > ul > li:nth-child(" + cnt + ") > div > div.my-2 > a")
+ String href = doc.select("#__next > main > div > div:nth-child(2) > div > div:nth-child(5) > div > ul > li:nth-child(" + cnt + ") > div > div.my-2 > a")
.attr("href");
String sNum = href.substring(10, href.lastIndexOf('?'));
num = Integer.parseInt(sNum);
Expand All @@ -119,6 +122,7 @@ private int startPage(WebDriver driver, int start) throws StringIndexOutOfBounds
}
cnt=1;
page++;

}
}

Expand Down

0 comments on commit ea3cf3b

Please sign in to comment.