Skip to content

Commit

Permalink
init
Browse files Browse the repository at this point in the history
  • Loading branch information
Asiri Hewage committed Jan 30, 2022
0 parents commit fc02c70
Show file tree
Hide file tree
Showing 8 changed files with 152 additions and 0 deletions.
Binary file added 47dcf6e5-0d63-4824-9135-e2b4171a171f.jfif
Binary file not shown.
4 changes: 4 additions & 0 deletions Config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"""
simplest xpath web scraper
@author: Asiri Hewage
"""
27 changes: 27 additions & 0 deletions Data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""
simplest xpath web scraper
@author: Asiri Hewage
"""

# Scrape configuration consumed by main.run().
#
# Each entry describes one page:
#   "url"    - the page to download.
#   "xpaths" - a list of dicts mapping an output column name to the XPath
#              expression evaluated against that page.
data = [
    {
        "url": "https://www.technology.pitt.edu/blog/zoom10faq",
        "xpaths": [
            {
                "question": '//div[@class="field-item even"]/h2/text()',
                "answers": '//div[@class="field-item even"]/p/text()',
                # XPath positions are 1-based: p[1] is the first <p>.
                # A p[0] predicate never matches and always yields [].
                "correct_answer": '//div[@class="field-item even"]/p[1]/text()',
            },
        ],
    },
    {
        "url": "https://www.socialsciencespace.com/2020/03/16-answers-to-your-questions-about-teaching-online/",
        "xpaths": [
            {
                # NOTE(review): these look like placeholders. An absolute
                # single-step path such as "/img" only matches when the
                # document's root element has that name, so on an HTML page
                # each of these evaluates to an empty list. Replace them with
                # real expressions (e.g. "//img/@src") before relying on them.
                "column_name1": "/img",
                "column_name2": "/p",
                "column_name3": "/a",
            },
        ],
    },
]
17 changes: 17 additions & 0 deletions Database.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""
simplest xpath web scraper
@author: Asiri Hewage
"""

import pymongo


class Database:
    """Thin wrapper around a single MongoDB collection.

    The connection target used to be hard-coded; it is now configurable
    through constructor arguments whose defaults reproduce the original
    behaviour, so ``Database()`` keeps working unchanged.

    Attributes:
        myclient: the ``pymongo.MongoClient`` for ``uri``.
        mydb: the database handle ``myclient[db_name]``.
        mycol: the collection handle ``mydb[collection_name]``.
    """

    def __init__(
        self,
        uri="mongodb://localhost:27017/",
        db_name="mydatabase",
        collection_name="customers",
    ):
        """Open a client and select the target database and collection.

        :param uri: MongoDB connection string.
        :param db_name: name of the database to use.
        :param collection_name: name of the collection documents go into.
        """
        self.myclient = pymongo.MongoClient(uri)
        self.mydb = self.myclient[db_name]
        self.mycol = self.mydb[collection_name]

    def insert(self, data):
        """Insert one document into the collection.

        :param data: the document (a dict) to store. Per the pymongo
            documentation, ``insert_one`` adds an ``_id`` key to the dict
            in place when it has none.
        :return: whatever ``insert_one`` returns (pymongo's insert result).
        """
        return self.mycol.insert_one(data)

    def close(self):
        """Release the underlying client connection."""
        self.myclient.close()
37 changes: 37 additions & 0 deletions Extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""
simplest xpath web scraper
@author: Asiri Hewage
"""
from lxml import html
import requests


class Extractor:
    """Download a page and evaluate named XPath expressions against it.

    Attributes:
        r: the ``requests`` response of the most recent ``extract`` call.
        tree: the lxml tree parsed from that response.
        xpaths: the xpath groups passed to the most recent ``extract`` call.
    """

    def __init__(self):
        self.r = None
        self.tree = None
        self.xpaths = None

    def extract(self, url, xpaths, *, timeout=30):
        """Fetch ``url`` and run every XPath in ``xpaths`` on the page.

        :param url: address of the page to download.
        :param xpaths: iterable of dicts, each mapping a column name to an
            XPath expression.
        :param timeout: seconds to wait for the server before giving up;
            without it ``requests.get`` may block indefinitely.
        :return: ``{"url": url, "data": [{column_name: result}, ...]}`` with
            one single-key dict per (column name, XPath) pair, in input
            order. ``result`` is whatever ``tree.xpath`` returns.
        :raises requests.RequestException: on connection failure, timeout,
            or an HTTP error status (4xx/5xx).
        """
        self.xpaths = xpaths

        self.r = requests.get(url, timeout=timeout)
        # Fail loudly instead of scraping an error page as if it were content.
        self.r.raise_for_status()
        self.tree = html.fromstring(self.r.content)

        data = []
        for group in xpaths:
            for column_name, xpath in group.items():
                data.append({column_name: self.tree.xpath(xpath)})

        return {"url": url, "data": data}
41 changes: 41 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Simplest xpath web scraper
A simple web scraper written in Python 3
- extract data using multiple xpaths from multiple urls
- save response in MongoDB
- exceptions and error handling
- only for basic web scraping work from static HTML web pages

## setup Data.py for each url with xpath
```python
{
"url": "https://www.technology.pitt.edu/blog/zoom10faq",
"xpaths": [
{
"questions": '//div[@class="field-item even"]/h2/text()',
"answers": '//div[@class="field-item even"]/p/text()',
"correct_answer": '//div[@class="field-item even"]/p[1]/text()'
}
]
}
```
## setup mongodb database connection string
```python
myclient = pymongo.MongoClient("mongodb://host:port/") # or add the connection url
mydb = myclient["database"]
mycol = mydb["collection"]
```

## install python dependencies
```commandline
pip3 install -r requirements.txt
```

## run
```commandline
python3 main.py
```

## response
![Simplest xpath web scraper](47dcf6e5-0d63-4824-9135-e2b4171a171f.jfif)

### Author : Asiri Hewage
23 changes: 23 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""
simplest xpath web scraper
@author: Asiri Hewage
"""
from Extractor import Extractor
from Data import data
# from Database import Database


def run():
    """Scrape every configured page and print each result.

    Iterates over the entries of ``data``, calls ``Extractor.extract`` with
    the entry's ``url`` and ``xpaths``, and prints the returned dict.
    Errors are handled per entry, so one unreachable or malformed entry no
    longer prevents the remaining URLs from being processed.
    """
    extractor = Extractor()
    # database = Database()
    for obj in data:
        try:
            res = extractor.extract(obj["url"], obj["xpaths"])
            # database.insert(res)
            print(res)
        except Exception as er:
            # Top-level boundary: report which entry failed and carry on.
            print("Failed to scrape {!r}: {}".format(obj.get("url"), er))


if __name__ == '__main__':
    run()
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
requests~=2.26.0
lxml~=4.6.3
pymongo

0 comments on commit fc02c70

Please sign in to comment.