Advertisement
iostream_h

Парсер событий (1 год)

May 6th, 2025
200
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.16 KB | None | 0 0
  1. import re
  2. import datetime as dt
  3. import requests, pandas as pd, matplotlib.pyplot as plt
  4. from bs4 import BeautifulSoup, Tag, NavigableString
  5. from tqdm.notebook import tqdm
  6.  
  7. HEADERS = {"User-Agent": "Mozilla/5.0 (Colab bulk script)"}
  8.  
  9. # --- парсер одной страницы ---------------------------------------------------
  10. dash = "–—-"
  11. date_re = re.compile(
  12.     rf"^\s*(?P<month>[A-Z][a-z]+)\s+(?P<day>\d{{1,2}})\s*[{dash}]\s*"
  13. )
  14. month_num = {m: i for i, m in enumerate(
  15.     ["January","February","March","April","May","June",
  16.      "July","August","September","October","November","December"], 1)}
  17.  
  18. def fetch_year_events(year: int):
  19.     url = f"https://en.wikipedia.org/wiki/{year}"
  20.     html = requests.get(url, headers=HEADERS, timeout=30).text
  21.     soup = BeautifulSoup(html, "html.parser")
  22.  
  23.     # ― найти заголовок «Events» ―
  24.     anchor = soup.find(id="Events")
  25.     if not anchor:
  26.         return []                        # нет секции → вернём пустой список
  27.  
  28.     events_h = anchor if anchor.name and anchor.name.startswith("h") \
  29.                else anchor.find_parent(re.compile(r"h[1-6]"))
  30.     if not events_h:                     # неудача найти заголовок
  31.         return []
  32.  
  33.     level = int(events_h.name[1])        # <h2> → 2
  34.     events = []
  35.  
  36.     for node in events_h.next_elements:
  37.         if isinstance(node, Tag) and re.fullmatch(r"h[1-6]", node.name):
  38.             if int(node.name[1]) <= level and node is not events_h:
  39.                 break                    # дошли до следующей большой секции
  40.         if isinstance(node, Tag) and node.name == "li":
  41.             text = node.get_text(" ", strip=True)
  42.             m = date_re.match(text)
  43.             if not m:
  44.                 continue                 # пропустим без точного дня
  45.             month = month_num.get(m.group("month"))
  46.             if not month:
  47.                 continue
  48.             day   = int(m.group("day"))
  49.             date_iso = dt.date(year, month, day).isoformat()
  50.             events.append((date_iso, date_re.sub("", text).strip()))
  51.     return events
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement
OSZAR »