Advertisement
Jexal

6a663d51-9460-41fb-8aa1-4415ada43c4d

May 7th, 2025
160
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 8.13 KB | None | 0 0
  1. import requests
  2. import os
  3. import csv
  4. import logging
  5. import threading
  6. import sys
  7. import time
  8. import yaml
  9. import shutil
  10. from rich.progress import Progress, BarColumn, TextColumn, TimeRemainingColumn, SpinnerColumn
  11. from concurrent.futures import ThreadPoolExecutor, as_completed
  12. from rich.console import Console
  13. from colorama import Fore, Style, init
  14. from rich.traceback import install
  15. import webbrowser
  16.  
# Initialize modules
install()  # rich pretty tracebacks for uncaught exceptions (installed first so later init errors are readable)
console = Console()
init(autoreset=True)  # colorama: reset terminal color after each print

# Logging setup
# NOTE: DEBUG level to a file; console output is handled separately via rich.
LOG_FILE = "Waylink.log"
logging.basicConfig(
    filename=LOG_FILE,
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)
logging.info("Script initialized.")
  31.  
  32.  
  33. class WaybackMachineArchiver:
  34.     def __init__(self, config_file="config.yaml"):
  35.         """Loads configuration and initializes paths."""
  36.         try:
  37.             with open(config_file, "r") as file:
  38.                 config = yaml.safe_load(file)
  39.                 if not isinstance(config, dict):
  40.                     raise ValueError("Configuration file must be a dictionary.")
  41.         except Exception as e:
  42.             console.print(f"[red]Failed to load configuration: {e}[/red]")
  43.             logging.exception("Config loading error.")
  44.             exit(1)
  45.  
  46.         self.input_file = config.get("input_file", "urls.txt")
  47.         self.output_dir = config.get("output_dir", "output")
  48.         self.max_workers = max(1, config.get("max_workers", os.cpu_count() or 4))
  49.         self.retries = config.get("retries", 3)
  50.         self.timeout = config.get("timeout", 10)
  51.         self.initial_delay = config.get("initial_delay", 1)
  52.         self.max_delay = config.get("max_delay", 16)
  53.         self.verbose = config.get("verbose", True)
  54.         self.open_output_dir_on_completion = config.get("open_output_dir_on_completion", False)
  55.         self.save_summary = config.get("save_summary", True)
  56.  
  57.         os.makedirs(self.output_dir, exist_ok=True)
  58.         self.executor = ThreadPoolExecutor(max_workers=self.max_workers)
  59.         self.session = requests.Session()
  60.  
  61.     def shutdown_executors(self):
  62.         self.executor.shutdown(wait=True)
  63.         self.session.close()
  64.         logging.info("Executors and session shut down.")
  65.  
  66.     def backup_input_file(self):
  67.         try:
  68.             backup_file = self.input_file + ".bak"
  69.             shutil.copyfile(self.input_file, backup_file)
  70.             logging.info(f"Backup created: {backup_file}")
  71.         except Exception as e:
  72.             logging.error(f"Failed to backup input file {self.input_file}: {e}")
  73.  
  74.     def attempt_url(self, url):
  75.         attempt = 1
  76.         delay = self.initial_delay
  77.  
  78.         while attempt <= self.retries:
  79.             try:
  80.                 full_url = f"http://web.archive.org/save/{url}"
  81.                 response = self.session.get(full_url, timeout=self.timeout, allow_redirects=True)
  82.                 status = response.status_code
  83.  
  84.                 logging.debug(f"URL: {url} - Response: {status} (Attempt {attempt})")
  85.                 if self.verbose:
  86.                     console.print(f"[cyan]URL: {url} - Status: {status} (Attempt {attempt})[/cyan]")
  87.  
  88.                 if status == 200:
  89.                     return {'url': url, 'status': 200, 'error': None}
  90.                 elif status == 403:
  91.                     return {'url': url, 'status': 403, 'error': "Excluded"}
  92.                 elif status == 429:
  93.                     logging.warning(f"Rate-limited for URL: {url} - Attempt {attempt}")
  94.                     time.sleep(delay)
  95.                     delay = min(delay * 2, self.max_delay)
  96.                     attempt += 1
  97.                     continue
  98.  
  99.             except requests.exceptions.RequestException as e:
  100.                 logging.error(f"Request failed for {url} (Attempt {attempt}): {e}")
  101.                 time.sleep(delay)
  102.                 delay = min(delay * 2, self.max_delay)
  103.                 attempt += 1
  104.  
  105.         return {'url': url, 'status': None, 'error': "Max retries exceeded"}
  106.  
  107.     def process_all_urls(self, urls):
  108.         results = []
  109.         future_to_url = {self.executor.submit(self.attempt_url, url): url for url in urls}
  110.  
  111.         with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), BarColumn(), TimeRemainingColumn()) as progress:
  112.             task = progress.add_task("[cyan]Archiving URLs...", total=len(urls))
  113.  
  114.             for future in as_completed(future_to_url):
  115.                 url = future_to_url[future]
  116.                 try:
  117.                     result = future.result(timeout=self.timeout * self.retries)
  118.                     results.append(result)
  119.                 except Exception as exc:
  120.                     logging.error(f"{url} generated an exception: {exc}")
  121.                     results.append({'url': url, 'status': None, 'error': str(exc)})
  122.                 finally:
  123.                     progress.update(task, advance=1)
  124.  
  125.         return results
  126.  
  127.     def handle_result(self, result):
  128.         url, status = result['url'], result['status']
  129.         if status == 200:
  130.             console.print(f"[green]Saved: {url}[/green]")
  131.         elif status == 403:
  132.             console.print(f"[yellow]Excluded: {url} (403)[/yellow]")
  133.         else:
  134.             console.print(f"[red]Failed: {url} ({result.get('error')})[/red]")
  135.  
  136.     def write_summary(self, saved, excluded, failed, execution_time):
  137.         summary_file = os.path.join(self.output_dir, "summary.csv")
  138.         try:
  139.             with open(summary_file, "w", newline="", encoding="utf-8") as f:
  140.                 writer = csv.writer(f)
  141.                 writer.writerow(["Status", "URL"])
  142.                 for url in saved:
  143.                     writer.writerow(["Saved", url])
  144.                 for url in excluded:
  145.                     writer.writerow(["Excluded", url])
  146.                 for url in failed:
  147.                     writer.writerow(["Failed", url])
  148.                 writer.writerow([])
  149.                 writer.writerow(["Total time (s)", round(execution_time, 2)])
  150.  
  151.             logging.info(f"Summary saved to: {summary_file}")
  152.         except Exception as e:
  153.             logging.error(f"Error writing summary: {e}")
  154.  
  155.     def summarize_results(self, results, execution_time):
  156.         saved = [r['url'] for r in results if r['status'] == 200]
  157.         excluded = [r['url'] for r in results if r['status'] == 403]
  158.         failed = [r['url'] for r in results if r['status'] not in (200, 403)]
  159.  
  160.         console.print("\n[bold green]Summary:[/bold green]")
  161.         if saved:
  162.             console.print(f"[green]Saved ({len(saved)}):[/green]")
  163.             for url in saved:
  164.                 console.print(f"[green]{url}[/green]")
  165.  
  166.         if excluded:
  167.             console.print(f"\n[yellow]Excluded ({len(excluded)}):[/yellow]")
  168.             for url in excluded:
  169.                 console.print(f"[yellow]{url}[/yellow]")
  170.  
  171.         if failed:
  172.             console.print(f"\n[red]Failed ({len(failed)}):[/red]")
  173.             for url in failed:
  174.                 console.print(f"[red]{url}[/red]")
  175.  
  176.         if self.save_summary:
  177.             self.write_summary(saved, excluded, failed, execution_time)
  178.  
  179.         if self.open_output_dir_on_completion:
  180.             try:
  181.                 webbrowser.open(self.output_dir)
  182.             except Exception as e:
  183.                 logging.warning(f"Failed to open output directory: {e}")
  184.  
  185.  
  186. if __name__ == "__main__":
  187.     archiver = WaybackMachineArchiver(config_file="config.yaml")
  188.  
  189.     try:
  190.         archiver.backup_input_file()
  191.         with open(archiver.input_file, "r") as file:
  192.             urls = list(dict.fromkeys(line.strip() for line in file if line.strip()))  # Deduplicated, order preserved
  193.         if not urls:
  194.             console.print("[yellow]No URLs found in the input file.[/yellow]")
  195.             sys.exit(0)
  196.  
  197.         start = time.time()
  198.         results = archiver.process_all_urls(urls)
  199.         duration = time.time() - start
  200.  
  201.         for result in results:
  202.             archiver.handle_result(result)
  203.  
  204.         archiver.summarize_results(results, duration)
  205.  
  206.     finally:
  207.         archiver.shutdown_executors()
  208.  
  209.     console.input("\n[cyan]Press [bold cyan]Enter[/bold cyan] to exit...[/cyan]")
  210.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement
OSZAR »