# Imports used by the scraper classes and tests below
# (test_eq is fastcore's test helper; rmtree is used for cleanup at the end).
import os
import json
from pathlib import Path
from shutil import rmtree

import bs4
import requests
from fastcore.test import test_eq


class WikiPage():
    def __init__(self, url, wiki_url):
        self.url = url
        self.wiki_url = wiki_url
        self.crawled = self.crawl()
        # Wikipedia page titles end with " - Wikipedia"; strip that suffix
        self.title = self.crawled.title.text.replace(" - Wikipedia", "")

    def get_beginning_links(self):
        # Collect links with a title attribute from the first two paragraphs of the article body
        self.page_beginning = self.crawled.find_all("div", {"id": "mw-content-text"})[0]
        self.beginning_links = []
        for i in range(2):
            try:
                self.beginning_links += self.page_beginning.find_all("p")[i].find_all("a", {'title': True})
            except:
                continue
        # Turn relative hrefs ("/wiki/...") into absolute URLs
        self.beginning_links = [self.wiki_url + a.get("href")[1:] for a in self.beginning_links]
        return self.beginning_links

    def crawl(self):
        # Download the page and parse it with BeautifulSoup
        url = self.url
        response = requests.get(url)
        soup = bs4.BeautifulSoup(response.text, "html.parser")
        return soup

    def __str__(self):
        return self.title

    __repr__ = __str__
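A minimal way to try WikiPage on its own (a sketch, assuming network access; the exact title and links depend on the live article):

page = WikiPage("https://en.wikipedia.org/wiki/Ecology", wiki_url="https://en.wikipedia.org/")
print(page)                        # the parsed title, e.g. "Ecology"
print(page.get_beginning_links())  # absolute URLs found in the first two paragraphs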
class Crawler():
    def __init__(self, output_path, lang="en", first_n=10):
        self.lang = lang
        self.search_url = f"https://{lang}.wikipedia.org/w/index.php?search="
        self.wiki_url = f"https://{lang}.wikipedia.org/"
        self.pages = {}
        self.first_n = first_n
        self.output_path = Path(output_path)
        if not self.output_path.exists():
            self.output_path.mkdir(parents=True, exist_ok=True)
    def create_page_from_query(self, query):
        # Build a search URL for the query and crawl the resulting page
        url = self.search_url + query.replace(" ", "+")
        return self.create_page(url)

    def create_page(self, url):
        return WikiPage(url, wiki_url=self.wiki_url)

    def create_card(self, url, final=False):
        # A "card" is a starting page plus up to first_n pages linked from its
        # opening paragraphs; children are only crawled when final=True.
        starting_page = self.create_page(url)
        out = {"starting_page": starting_page, "children_pages": {}}
        count = 0
        for url in out["starting_page"].get_beginning_links():
            try:
                if final:
                    out["children_pages"][url] = self.create_page(url)
                else:
                    out["children_pages"][url] = None
            except:
                continue
            count += 1
            if len(out["children_pages"]) >= self.first_n:
                break
        return out

    def create_cards(self, query):
        # Crawl the query's page, then build a fully crawled card for each child page
        url = self.search_url + query.replace(" ", "+")
        cards = self.create_card(url, final=False)
        out = {"query": str(cards["starting_page"]), "cards": []}
        for url in cards["children_pages"].keys():
            try:
                new_card = self.create_card(url, final=True)
            except:
                continue
            out["cards"].append({str(new_card["starting_page"]): [str(i) for i in new_card["children_pages"].values()]})
        return out

    def save_cards(self, cards):
        # Serialize the cards dictionary to <output_path>/<query>.json
        with open(self.output_path / f"{cards['query']}.json", "w") as f:
            json.dump(cards, f)
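For reference, create_cards returns a plain dictionary that save_cards writes to <output_path>/<query>.json. A sketch of its shape (the titles here are illustrative placeholders, not real crawl results):

# Illustrative shape of Crawler.create_cards("ecology") output
{
    "query": "Ecology",                                        # title of the page the query resolved to
    "cards": [
        {"Natural environment": ["Ecosystem", "Biosphere"]},   # child title -> titles of its own beginning links
        {"Biodiversity": ["Species", "Genetic diversity"]},
    ],
}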
Tests
= "ecology"
query = Crawler("./test_data/temp_files", "en")
crawler = crawler.create_page_from_query("football")
page = crawler.create_cards(query)
cards
crawler.save_cards(cards)
+ "football", "https://en.wikipedia.org/w/index.php?search=football")
test_eq(crawler.search_url + "wiki/football", "https://en.wikipedia.org/wiki/football")
test_eq(crawler.wiki_url "https://en.wikipedia.org/w/index.php?search=football")
test_eq(page.url, assert os.path.exists(f"./test_data/temp_files/{query.replace(' ', '_').capitalize()}.json")
# rmtree("./test_data/temp_files")