from bs4 import BeautifulSoup
def extract_html_values(html_content, tags_to_extract=None):
    """Collect the text of HTML elements, grouped by tag name.

    Args:
        html_content (str): The HTML markup to parse.
        tags_to_extract (list, optional): Tag names (e.g. ``['p', 'a']``) to
            restrict extraction to. A single name may also be given as a
            plain string. If None, every tag in the document is visited.
            Defaults to None.

    Returns:
        dict: Maps each tag name to the list of non-empty text values found
        for that tag, in document order. Returns an empty dictionary when
        html_content is falsy (``None`` or ``""``) or nothing matches.
    """
    if not html_content:
        return {}

    soup = BeautifulSoup(html_content, 'html.parser')

    # find_all(True) matches every tag; starting from the parsed elements
    # guarantees the loop below always works on Tag objects, never on the
    # caller's name strings.
    elements = soup.find_all(True)

    if tags_to_extract is not None:
        if isinstance(tags_to_extract, str):
            # A bare string would otherwise be split into single characters.
            tags_to_extract = [tags_to_extract]
        wanted = set(tags_to_extract)
        elements = [element for element in elements if element.name in wanted]

    extracted_data = {}
    for element in elements:
        # get_text() includes the text of nested tags; strip=True trims
        # whitespace around each text fragment before they are joined.
        text = element.get_text(strip=True)
        if text:
            extracted_data.setdefault(element.name, []).append(text)
    return extracted_data
if __name__ == '__main__':
    # Demo: run the extractor against a small inline document.
    sample_markup = """
<html>
<head>
<title>My Webpage</title>
</head>
<body>
<h1>Heading</h1>
<p>This is a paragraph.</p>
<a href="https://example.com">Link</a>
<ul>
<li>Item 1</li>
<li>Item 2</li>
</ul>
</body>
</html>
"""
    # First pass: no filter, so every tag in the document is considered.
    print("All Values:", extract_html_values(sample_markup))
    # Second pass: restrict the result to paragraph and anchor tags.
    print("\nParagraph and Link Values:", extract_html_values(sample_markup, ['p', 'a']))
# Add your comment