import json

import html2text
from bs4 import BeautifulSoup
  4. def convert_html(input_html, output_format="text", default_values=None):
  5. """
  6. Converts HTML to various formats with default values.
  7. Args:
  8. input_html (str): The HTML content to convert.
  9. output_format (str): The desired output format ("text", "markdown", "json"). Defaults to "text".
  10. default_values (dict): Dictionary of default values to use if elements are missing.
  11. Returns:
  12. str or dict: The converted content as a string or dictionary, depending on the output format.
  13. Returns None if the output format is invalid.
  14. """
  15. if output_format == "text":
  16. try:
  17. h = html2text.HTML2Text()
  18. h.ignore_links = True # Optional: ignore links
  19. h.ignore_images = True #Optional : ignore images
  20. text = h.handle(input_html)
  21. return text
  22. except Exception as e:
  23. print(f"Error converting to text: {e}")
  24. return None
  25. elif output_format == "markdown":
  26. try:
  27. soup = BeautifulSoup(input_html, 'html.parser')
  28. return soup.prettify(formatter='html')
  29. except Exception as e:
  30. print(f"Error converting to markdown: {e}")
  31. return None
  32. elif output_format == "json":
  33. try:
  34. soup = BeautifulSoup(input_html, 'html.parser')
  35. data = {}
  36. for element in soup.find_all():
  37. tag = element.name
  38. attributes = element.attrs
  39. text = element.get_text(strip=True)
  40. if tag not in data:
  41. data[tag] = []
  42. data[tag].append({
  43. "attributes": attributes,
  44. "text": text
  45. })
  46. return json.dumps(data, indent=4)
  47. except Exception as e:
  48. print(f"Error converting to JSON: {e}")
  49. return None
  50. else:
  51. print("Invalid output format.")
  52. return None
  53. if __name__ == '__main__':
  54. # Example usage
  55. html_content = """
  56. <html>
  57. <head>
  58. <title>My Page</title>
  59. </head>
  60. <body>
  61. <h1>Hello, world!</h1>
  62. <p>This is a paragraph.</p>
  63. <a href="https://example.com">Example Link</a>
  64. <div>
  65. <p>Some content inside a div.</p>
  66. </div>
  67. </body>
  68. </html>
  69. """
  70. # Convert to text
  71. text_output = convert_html(html_content, output_format="text")
  72. if text_output:
  73. print("Text Output:\n", text_output)
  74. # Convert to markdown
  75. markdown_output = convert_html(html_content, output_format="markdown")
  76. if markdown_output:
  77. print("\nMarkdown Output:\n", markdown_output)
  78. # Convert to JSON
  79. json_output = convert_html(html_content, output_format="json")
  80. if json_output:
  81. print("\nJSON Output:\n", json_output)

# Add your comment