import re

import requests
from bs4 import BeautifulSoup, Comment
  4. def clean_html(html_content):
  5. """
  6. Cleans HTML content for monitoring purposes.
  7. Removes script, style, and potentially irrelevant tags/attributes.
  8. """
  9. soup = BeautifulSoup(html_content, 'html.parser')
  10. # Remove script and style tags
  11. for script in soup(["script", "style"]):
  12. script.extract()
  13. # Remove comments
  14. for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
  15. comment.extract()
  16. # Remove potentially irrelevant tags - adjust the list as needed
  17. tags_to_remove = ['nav', 'footer', 'header', 'aside']
  18. for tag in tags_to_remove:
  19. for element in soup.find_all(tag):
  20. element.extract()
  21. # Remove extra whitespace and newlines
  22. text = soup.get_text()
  23. text = re.sub(r'\s+', ' ', text).strip() #replace multiple spaces with single space
  24. return text
  25. def fetch_and_clean_html(url):
  26. """
  27. Fetches HTML from a URL and cleans it.
  28. """
  29. try:
  30. response = requests.get(url)
  31. response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
  32. html_content = response.text
  33. cleaned_text = clean_html(html_content)
  34. return cleaned_text
  35. except requests.exceptions.RequestException as e:
  36. print(f"Error fetching URL {url}: {e}")
  37. return None
  38. if __name__ == '__main__':
  39. # Example Usage (replace with your monitoring URLs)
  40. url_list = [
  41. "https://www.example.com",
  42. "https://www.python.org"
  43. ]
  44. for url in url_list:
  45. cleaned_data = fetch_and_clean_html(url)
  46. if cleaned_data:
  47. print(f"Cleaned data from {url}:\n{cleaned_data[:500]}...\n") #print first 500 chars