1. import os
  2. import time
  3. import logging
  4. from bs4 import BeautifulSoup
  5. from urllib.parse import urlparse
  6. import requests
  7. # Configure logging
  8. logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
  9. def release_html_resources(url_list, dry_run=True, delay=1):
  10. """
  11. Releases resources of HTML documents from a list of URLs.
  12. Args:
  13. url_list (list): A list of URLs to process.
  14. dry_run (bool): If True, only simulate the resource release.
  15. delay (int): Delay in seconds between requests.
  16. """
  17. for url in url_list:
  18. try:
  19. if not dry_run:
  20. logging.info(f"Processing URL: {url}")
  21. response = requests.get(url, timeout=10)
  22. response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
  23. soup = BeautifulSoup(response.content, 'html.parser')
  24. # Example resource release - remove script tags
  25. for script in soup.find_all('script'):
  26. script.decompose()
  27. # Example resource release - remove style tags
  28. for style in soup.find_all('style'):
  29. style.decompose()
  30. # Example resource release - remove images
  31. for img in soup.find_all('img'):
  32. img.decompose()
  33. # Save modified HTML (optional)
  34. with open(f"temp_{url.split('//')[1].split('/')[0]}.html", "w", encoding="utf-8") as f:
  35. f.write(str(soup))
  36. logging.info(f"Released resources from: {url}")
  37. else:
  38. logging.info(f"Dry run: Would process URL: {url}")
  39. time.sleep(delay) # simulate processing time
  40. except requests.exceptions.RequestException as e:
  41. logging.error(f"Request error for {url}: {e}")
  42. except Exception as e:
  43. logging.error(f"Error processing {url}: {e}")
  44. time.sleep(delay) # Delay between requests
  45. if __name__ == '__main__':
  46. # Example Usage
  47. urls = [
  48. "https://www.example.com",
  49. "https://www.python.org",
  50. "https://www.google.com"
  51. ]
  52. release_html_resources(urls, dry_run=True, delay=2) # Set dry_run to False to actually release resources
  53. #release_html_resources(urls, dry_run=False, delay=2) # To actually release resources

Add your comment