import requests
from bs4 import BeautifulSoup
def strip_form_metadata(url, max_length=200, max_words=10, timeout=10):
    """
    Fetch a web page and return the truncated text of every <form> on it.

    For each form, the text content is stripped of leading/trailing
    whitespace, cut to ``max_length`` characters, and then cut to
    ``max_words`` whitespace-separated words. Each cut appends "...", so a
    truncated entry can be up to three characters longer than
    ``max_length``. When the word cut applies, the kept words are re-joined
    with single spaces. Results for the individual forms are joined with
    newlines, in document order.

    Args:
        url (str): The URL of the webpage containing the form.
        max_length (int): Maximum number of characters kept from each form's
            text (not counting the appended "...").
        max_words (int): Maximum number of words kept from each form's text.
        timeout (float): Seconds passed to ``requests.get`` as its timeout.
            A request that times out is reported like any other fetch error.

    Returns:
        str: The truncated form text, one form per line; an empty string if
        the page yields no form text. Returns None if an error occurs
        (the error is printed to stdout).
    """
    try:
        # requests does not time out by default, so an unresponsive server
        # would block this call indefinitely without an explicit timeout.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        soup = BeautifulSoup(response.content, 'html.parser')

        pieces = []
        for form in soup.find_all('form'):
            # Text of the form and its descendants, without surrounding whitespace.
            form_text = form.get_text().strip()

            # Character limit first; the ellipsis marks that text was dropped.
            if len(form_text) > max_length:
                form_text = form_text[:max_length] + "..."

            # Word limit second, applied to the (possibly already cut) text.
            words = form_text.split()
            if len(words) > max_words:
                form_text = " ".join(words[:max_words]) + "..."

            pieces.append(form_text)

        # One form per line; strip() drops blank lines at either end that
        # come from forms without any text.
        return "\n".join(pieces).strip()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None
    except Exception as e:
        # Best-effort contract: any other failure (e.g. during parsing) is
        # reported and turned into a None result rather than propagated.
        print(f"An error occurred: {e}")
        return None
if __name__ == '__main__':
    # Demo run: extract the form text from a sample page and show it.
    target_url = "https://www.example.com/form"  # Replace with your target URL
    form_summary = strip_form_metadata(target_url)

    # Both None (an error occurred) and "" (no form text found) are falsy,
    # so in either case nothing is printed here.
    if form_summary:
        print(form_summary)
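# NOTE: "Add your comment" -- leftover text from the web page this snippet was copied from; not part of the program.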