Парсер событий (1 год)

iostream_h

May 6th, 2025

200

Never

Add comment

Not a member of Pastebin yet? Sign Up, it unlocks many cool features!

Python 2.16 KB | None | 0 0

raw download clone embed print report

import re
import datetime as dt
import requests, pandas as pd, matplotlib.pyplot as plt
from bs4 import BeautifulSoup, Tag, NavigableString
from tqdm.notebook import tqdm
# Request headers: an explicit User-Agent string for outgoing HTTP calls.
HEADERS = {"User-Agent": "Mozilla/5.0 (Colab bulk script)"}

# --- single-page parser ------------------------------------------------------
# Separator characters accepted after the date: en dash, em dash, hyphen.
# The hyphen comes last so it stays literal inside the character class below.
dash = "–—-"

# Matches a leading "<Month> <day> <dash>" prefix such as "March 7 – ".
# The named groups "month" and "day" capture the two parts of the date.
date_re = re.compile(
    rf"^\s*(?P<month>[A-Z][a-z]+)\s+(?P<day>\d{{1,2}})\s*[{dash}]\s*"
)

# English month name -> month number (1..12).
month_num = dict(zip(
    (
        "January", "February", "March", "April",
        "May", "June", "July", "August",
        "September", "October", "November", "December",
    ),
    range(1, 13),
))
def fetch_year_events(year: int):
    """Download the English Wikipedia page for *year* and extract dated events.

    The function locates the element with ``id="Events"``, resolves the
    heading (``<h1>``..``<h6>``) it belongs to, and then walks the document
    forward until the next heading of the same or a higher level. Every
    ``<li>`` met on the way whose text starts with ``"<Month> <day> <dash>"``
    (see ``date_re``) becomes one event.

    Args:
        year: Year number; used both in the page URL and to build the dates.

    Returns:
        A list of ``(iso_date, description)`` tuples in document order, where
        ``iso_date`` is ``"YYYY-MM-DD"`` and ``description`` is the item text
        with the date prefix removed. An empty list is returned when the
        "Events" anchor or its heading cannot be found.

    Raises:
        requests.RequestException: propagated from ``requests.get`` (e.g. on
            connection failure or timeout).
    """
    # Anchored so that both ``fullmatch`` and BeautifulSoup's own name
    # matching accept only real heading tags h1..h6; compiled once per call
    # instead of once per visited node.
    heading_re = re.compile(r"^h[1-6]$")

    url = f"https://en.wikipedia.org/wiki/{year}"
    # NOTE: the HTTP status is not checked here, so an error page that lacks
    # an "Events" anchor is reported as "no events" (empty list).
    html = requests.get(url, headers=HEADERS, timeout=30).text
    soup = BeautifulSoup(html, "html.parser")

    # --- locate the "Events" heading ---
    anchor = soup.find(id="Events")
    if not anchor:
        return []  # no such section -> nothing to report

    # The id may sit on the heading itself or on an element nested inside it.
    # Only a genuine h1..h6 tag is accepted: a looser test such as
    # ``startswith("h")`` would also take <header>/<hr>/<html>, and the level
    # extraction below would then fail on a non-digit character.
    if anchor.name and heading_re.fullmatch(anchor.name):
        events_h = anchor
    else:
        events_h = anchor.find_parent(heading_re)
    if not events_h:  # could not resolve the heading
        return []

    level = int(events_h.name[1])  # <h2> -> 2

    events = []
    for node in events_h.next_elements:
        if not isinstance(node, Tag):
            continue  # text nodes carry no structure of their own

        if heading_re.fullmatch(node.name):
            if int(node.name[1]) <= level:
                break  # reached the next section of the same or higher rank
            continue

        if node.name != "li":
            continue

        text = node.get_text(" ", strip=True)
        m = date_re.match(text)
        if not m:
            continue  # skip items that do not start with an exact day

        month = month_num.get(m.group("month"))
        if not month:
            continue  # capitalised word that is not an English month name

        try:
            date_iso = dt.date(year, month, int(m.group("day"))).isoformat()
        except ValueError:
            # Impossible calendar date (e.g. "February 30"): skip this item
            # rather than abort parsing of the whole page.
            continue

        # date_re is anchored at the start of the string, so slicing at the
        # end of the match removes exactly the date prefix.
        events.append((date_iso, text[m.end():].strip()))

    return events

Add Comment

Please, Sign In to add comment