import os
import time
import logging
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import requests
# Configure root-logger output once at import time: timestamped INFO-and-above messages.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def release_html_resources(url_list, dry_run=True, delay=1):
    """
    Strip heavy resources (script, style and img tags) out of HTML documents
    fetched from a list of URLs and save the slimmed copies to local files.

    Args:
        url_list (list): URLs to process.
        dry_run (bool): If True, only log what would be done; no network
            requests are made and no files are written.
        delay (int): Delay in seconds between URLs (basic rate limiting).
    """
    for url in url_list:
        try:
            if dry_run:
                logging.info(f"Dry run: Would process URL: {url}")
            else:
                logging.info(f"Processing URL: {url}")
                response = requests.get(url, timeout=10)
                response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
                soup = BeautifulSoup(response.content, 'html.parser')
                # Release resources: drop every script/style/img tag in one pass.
                # find_all accepts a list of names, so one loop covers all three.
                for tag in soup.find_all(['script', 'style', 'img']):
                    tag.decompose()
                # Derive the output filename from the URL's host via urlparse,
                # which tolerates odd URLs where naive '//'-splitting raises
                # IndexError. Replace ':' so a :port suffix stays filename-safe.
                host = urlparse(url).netloc or "unknown"
                with open(f"temp_{host.replace(':', '_')}.html", "w", encoding="utf-8") as f:
                    f.write(str(soup))
                logging.info(f"Released resources from: {url}")
        except requests.exceptions.RequestException as e:
            logging.error(f"Request error for {url}: {e}")
        except Exception as e:
            logging.error(f"Error processing {url}: {e}")
        # Single rate-limit sleep per URL, applied in both modes (the old code
        # slept twice per URL in dry-run mode but only once in real mode).
        time.sleep(delay)
if __name__ == '__main__':
    # Example usage: three well-known sites.
    sample_urls = [
        "https://www.example.com",
        "https://www.python.org",
        "https://www.google.com",
    ]
    # dry_run=True only logs what would happen; pass dry_run=False to
    # actually fetch the pages and strip their resources.
    release_html_resources(sample_urls, dry_run=True, delay=2)
# NOTE: removed stray non-Python text ("Add your comment" — a copy/paste artifact) that caused a SyntaxError.