import re

import requests
from bs4 import BeautifulSoup, Comment
def clean_html(html_content):
    """Clean HTML content for monitoring purposes.

    Strips script/style blocks, HTML comments, and boilerplate
    navigation tags, then collapses the remaining visible text into a
    single whitespace-normalized string.

    Args:
        html_content: Raw HTML markup as a string.

    Returns:
        The cleaned, whitespace-collapsed text content.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove script/style plus boilerplate chrome tags in one pass;
    # extract() drops the element and all of its children.
    # Adjust the tag list as needed for the sites being monitored.
    for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
        element.extract()

    # Remove HTML comments. Comment nodes are NavigableString
    # subclasses, so they are found via the string= filter.
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Collapse runs of whitespace/newlines into single spaces.
    text = soup.get_text()
    return re.sub(r'\s+', ' ', text).strip()
def fetch_and_clean_html(url, timeout=10):
    """Fetch HTML from *url* and return its cleaned text.

    Args:
        url: The URL to fetch.
        timeout: Seconds to wait for the server before giving up;
            without this a stalled host would hang the monitor forever.

    Returns:
        The cleaned text on success, or None if the request failed
        (connection error, timeout, or a 4xx/5xx status).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        # Best-effort: report the failure and let the caller skip this URL.
        print(f"Error fetching URL {url}: {e}")
        return None
    return clean_html(response.text)
if __name__ == '__main__':
    # Example usage (replace with your monitoring URLs).
    url_list = [
        "https://www.example.com",
        "https://www.python.org",
    ]
    for url in url_list:
        cleaned_data = fetch_and_clean_html(url)
        if cleaned_data:
            # Show only a preview; monitored pages can be large.
            print(f"Cleaned data from {url}:\n{cleaned_data[:500]}...\n")  # print first 500 chars