import html2text
from bs4 import BeautifulSoup
import json
def convert_html(input_html, output_format="text", default_values=None):
    """Convert an HTML string to text, Markdown, or a JSON summary.

    Args:
        input_html (str): The HTML content to convert.
        output_format (str): The desired output format ("text", "markdown",
            "json"). Matching is case-sensitive. Defaults to "text".
        default_values (dict): Accepted for interface compatibility.
            NOTE(review): no branch reads this argument, so it currently has
            no effect -- confirm the intended semantics before wiring it in.

    Returns:
        str: The converted content. For "json" this is a JSON-encoded string
            (indent=4) mapping each tag name to a list of
            {"attributes": ..., "text": ...} entries, one per element.
        None: If the output format is not recognised or the conversion
            raised; in both cases a message is printed to stdout.
    """
    if output_format == "text":
        try:
            converter = html2text.HTML2Text()
            # The text flavour drops link targets and images.
            converter.ignore_links = True
            converter.ignore_images = True
            return converter.handle(input_html)
        except Exception as e:
            # Best-effort contract: report the failure and signal it with None.
            print(f"Error converting to text: {e}")
            return None
    elif output_format == "markdown":
        try:
            # html2text renders HTML as Markdown. The previous implementation
            # returned BeautifulSoup's prettified HTML, which is not Markdown.
            converter = html2text.HTML2Text()
            # Keep links and images so the Markdown preserves them.
            converter.ignore_links = False
            converter.ignore_images = False
            return converter.handle(input_html)
        except Exception as e:
            print(f"Error converting to markdown: {e}")
            return None
    elif output_format == "json":
        try:
            soup = BeautifulSoup(input_html, 'html.parser')
            data = {}
            # Group every element in the document under its tag name.
            for element in soup.find_all():
                data.setdefault(element.name, []).append({
                    "attributes": element.attrs,
                    "text": element.get_text(strip=True),
                })
            return json.dumps(data, indent=4)
        except Exception as e:
            print(f"Error converting to JSON: {e}")
            return None
    else:
        print("Invalid output format.")
        return None
if __name__ == '__main__':
    # Demo: run one sample document through every supported output format.
    html_content = """
<html>
<head>
<title>My Page</title>
</head>
<body>
<h1>Hello, world!</h1>
<p>This is a paragraph.</p>
<a href="https://example.com">Example Link</a>
<div>
<p>Some content inside a div.</p>
</div>
</body>
</html>
"""
    # (format name, heading printed before the converted result)
    demos = (
        ("text", "Text Output:\n"),
        ("markdown", "\nMarkdown Output:\n"),
        ("json", "\nJSON Output:\n"),
    )
    for fmt, heading in demos:
        rendered = convert_html(html_content, output_format=fmt)
        # Skip the heading when conversion failed (None) or produced nothing.
        if rendered:
            print(heading, rendered)
# Add your comment