1. from bs4 import BeautifulSoup
  2. def split_html_data(html_string, split_by="div"):
  3. """
  4. Splits HTML data into a list of strings based on the specified tag.
  5. Returns a list of strings.
  6. """
  7. try:
  8. soup = BeautifulSoup(html_string, 'html.parser')
  9. if split_by == "div":
  10. # Split by <div> tags
  11. elements = soup.find_all('div')
  12. data_list = [str(element) for element in elements]
  13. elif split_by == "p":
  14. # Split by <p> tags
  15. elements = soup.find_all('p')
  16. data_list = [str(element) for element in elements]
  17. else:
  18. raise ValueError("Invalid split_by tag. Choose 'div' or 'p'.")
  19. return data_list
  20. except Exception as e:
  21. print(f"Error splitting HTML: {e}")
  22. return [] # Return an empty list on error
  23. if __name__ == '__main__':
  24. # Example Usage:
  25. html_data = """
  26. <div>
  27. <p>This is the first paragraph.</p>
  28. <p>This is the second paragraph.</p>
  29. </div>
  30. <div>
  31. <p>Another paragraph.</p>
  32. </div>
  33. """
  34. split_data = split_html_data(html_data, split_by="div")
  35. print(split_data)
  36. split_data = split_html_data(html_data, split_by="p")
  37. print(split_data)
  38. #Example with invalid split
  39. split_data = split_html_data(html_data, split_by="span")
  40. print(split_data)

Add your comment