1. import requests
  2. from bs4 import BeautifulSoup
  3. def strip_form_metadata(url, max_length=200, max_words=10):
  4. """
  5. Strips metadata from web forms on a given URL, limiting length and word count.
  6. Args:
  7. url (str): The URL of the webpage containing the form.
  8. max_length (int): Maximum character length of the stripped text.
  9. max_words (int): Maximum number of words in the stripped text.
  10. Returns:
  11. str: The stripped form text. Returns None if an error occurs.
  12. """
  13. try:
  14. response = requests.get(url)
  15. response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
  16. soup = BeautifulSoup(response.content, 'html.parser')
  17. forms = soup.find_all('form')
  18. stripped_text = ""
  19. for form in forms:
  20. #Extract text from form elements
  21. form_text = form.get_text()
  22. #Strip metadata - remove leading/trailing whitespace and newlines
  23. form_text = form_text.strip()
  24. # Limit the length of the stripped text
  25. if len(form_text) > max_length:
  26. form_text = form_text[:max_length] + "..."
  27. # Limit the number of words
  28. words = form_text.split()
  29. if len(words) > max_words:
  30. form_text = " ".join(words[:max_words]) + "..."
  31. stripped_text += form_text + "\n" # Add newline between forms
  32. return stripped_text.strip() #remove trailing newline
  33. except requests.exceptions.RequestException as e:
  34. print(f"Error fetching URL: {e}")
  35. return None
  36. except Exception as e:
  37. print(f"An error occurred: {e}")
  38. return None
  39. if __name__ == '__main__':
  40. # Example Usage
  41. url = "https://www.example.com/form" # Replace with your target URL
  42. stripped_form = strip_form_metadata(url)
  43. if stripped_form:
  44. print(stripped_form)

Add your comment