from bs4 import BeautifulSoup
  2. def extract_html_values(html_content, tags_to_extract=None):
  3. """
  4. Extracts text values from HTML content.
  5. Args:
  6. html_content (str): The HTML content as a string.
  7. tags_to_extract (list, optional): A list of HTML tags to extract values from.
  8. If None, extracts from all tags. Defaults to None.
  9. Returns:
  10. dict: A dictionary where keys are tag names and values are lists of extracted text.
  11. Returns an empty dictionary if html_content is empty or invalid.
  12. """
  13. if not html_content:
  14. return {}
  15. soup = BeautifulSoup(html_content, 'html.parser')
  16. extracted_data = {}
  17. if tags_to_extract is None:
  18. tags_to_extract = soup.find_all() # Extract from all tags if none specified
  19. for tag in tags_to_extract:
  20. tag_name = tag.name
  21. text = tag.get_text(strip=True) # Extract text, removing leading/trailing whitespace
  22. if text:
  23. if tag_name not in extracted_data:
  24. extracted_data[tag_name] = []
  25. extracted_data[tag_name].append(text)
  26. return extracted_data
  27. if __name__ == '__main__':
  28. # Example Usage
  29. html = """
  30. <html>
  31. <head>
  32. <title>My Webpage</title>
  33. </head>
  34. <body>
  35. <h1>Heading</h1>
  36. <p>This is a paragraph.</p>
  37. <a href="https://example.com">Link</a>
  38. <ul>
  39. <li>Item 1</li>
  40. <li>Item 2</li>
  41. </ul>
  42. </body>
  43. </html>
  44. """
  45. # Extract all values
  46. all_values = extract_html_values(html)
  47. print("All Values:", all_values)
  48. # Extract only paragraph and link values
  49. paragraph_link_values = extract_html_values(html, ['p', 'a'])
  50. print("\nParagraph and Link Values:", paragraph_link_values)

# Add your comment