import re
from pathlib import Path

import textract
  2. def strip_metadata(text):
  3. """
  4. Strips metadata from text blocks.
  5. Args:
  6. text (str): The text to strip metadata from.
  7. Returns:
  8. str: The text with metadata removed. Returns original text if an error occurs.
  9. """
  10. try:
  11. # Use textract to extract text, stripping metadata
  12. text = textract.process(text).decode('utf-8')
  13. return text
  14. except Exception as e:
  15. print(f"Error stripping metadata: {e}")
  16. return text # Return original text on error
  17. if __name__ == '__main__':
  18. # Example usage
  19. text_with_metadata = """
  20. This is a test document.
  21. Author: John Doe
  22. Date: 2023-10-27
  23. Title: Test Document
  24. """
  25. stripped_text = strip_metadata(text_with_metadata)
  26. print("Original text:\n", text_with_metadata)
  27. print("\nStripped text:\n", stripped_text)
  28. #Example with a file
  29. try:
  30. with open("test.pdf", "rb") as f:
  31. pdf_text = textract.process(f).decode('utf-8')
  32. stripped_pdf_text = strip_metadata(pdf_text)
  33. print("\nStripped PDF text:\n", stripped_pdf_text)
  34. except FileNotFoundError:
  35. print("test.pdf not found.")
  36. except Exception as e:
  37. print(f"Error processing PDF: {e}")

Add your comment