Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re
- import datetime as dt
- import requests, pandas as pd, matplotlib.pyplot as plt
- from bs4 import BeautifulSoup, Tag, NavigableString
- from tqdm.notebook import tqdm
# User-Agent header sent with every HTTP request made by this script.
HEADERS = {"User-Agent": "Mozilla/5.0 (Colab bulk script)"}

# --- single-page parser -------------------------------------------------------
# Characters accepted as the separator between the date and the event text:
# en dash, em dash and the plain ASCII hyphen.
dash = "–—-"

# Matches a leading "<Capitalised word> <1-2 digit day> <dash>" prefix and
# exposes the word and the day as the named groups "month" and "day".
# The dash characters are escaped so that their order inside the character
# class can never be misread as a range such as "a-z".
date_re = re.compile(
    rf"^\s*(?P<month>[A-Z][a-z]+)\s+(?P<day>\d{{1,2}})\s*[{re.escape(dash)}]\s*"
)

# English month name -> month number (1-12).
month_num = {
    name: number
    for number, name in enumerate(
        [
            "January", "February", "March", "April", "May", "June",
            "July", "August", "September", "October", "November", "December",
        ],
        1,
    )
}
def _parse_event_line(year, text):
    """Split one list-item text into an ``(iso_date, description)`` tuple.

    Returns ``None`` when *text* does not start with ``<Month> <day> <dash>``,
    when the leading word is not an English month name, or when the
    year/month/day combination is not a valid calendar date.
    """
    m = date_re.match(text)
    if m is None:
        return None  # item has no exact day at its start
    month = month_num.get(m.group("month"))
    if month is None:
        return None  # capitalised word that is not a month name
    try:
        date_iso = dt.date(year, month, int(m.group("day"))).isoformat()
    except ValueError:
        # Impossible date (e.g. "February 30") or a year that datetime
        # cannot represent: skip the item instead of aborting the page.
        return None
    # date_re is anchored at the start of the string, so cutting at the end
    # of the match equals substituting the prefix away, without a second
    # regex pass.
    return date_iso, text[m.end():].strip()


def fetch_year_events(year: int):
    """Fetch the English Wikipedia article for *year* and list its dated events.

    The function locates the element with ``id="Events"``, resolves it to its
    heading tag, and then walks the document after that heading until the next
    heading of the same or a higher level. Every ``<li>`` met on the way whose
    text starts with ``<Month> <day> <dash>`` becomes one result entry.

    Args:
        year: Year number used to build the article URL and the event dates.

    Returns:
        A list of ``(iso_date, description)`` tuples in document order. The
        list is empty when the page has no "Events" anchor or the anchor
        cannot be tied to a heading.

    Raises:
        requests.RequestException: propagated from ``requests.get``. The HTTP
            status code is not checked (left as best-effort).
    """
    url = f"https://en.wikipedia.org/wiki/{year}"
    html = requests.get(url, headers=HEADERS, timeout=30).text
    soup = BeautifulSoup(html, "html.parser")

    # Anchored on both sides: Beautiful Soup applies name patterns with
    # search(), so an unanchored pattern would be looser than intended.
    heading_re = re.compile(r"^h[1-6]$")

    # --- locate the "Events" heading ---
    anchor = soup.find(id="Events")
    if anchor is None:
        return []  # no such section -> empty list
    if heading_re.match(anchor.name or ""):
        events_h = anchor  # the id sits on the heading itself
    else:
        events_h = anchor.find_parent(heading_re)  # id sits inside the heading
    if events_h is None:
        return []  # anchor is not part of any heading
    level = int(events_h.name[1])  # <h2> -> 2

    events = []
    for node in events_h.next_elements:
        if not isinstance(node, Tag):
            continue  # text nodes carry no structure
        if heading_re.match(node.name):
            if int(node.name[1]) <= level:
                break  # reached the next section of the same or higher level
            continue  # deeper sub-heading: keep walking
        if node.name == "li":
            parsed = _parse_event_line(year, node.get_text(" ", strip=True))
            if parsed is not None:
                events.append(parsed)
    return events
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement