import textract
def strip_metadata(text):
    """
    Run ``text`` through textract and return the decoded result.

    The argument is handed to ``textract.process`` unchanged and the bytes
    it returns are decoded as UTF-8. Any exception raised along the way is
    printed and the input is returned untouched (best-effort behaviour).

    NOTE(review): ``textract.process`` is documented as taking a file path,
    so a plain block of text will most likely raise and come back unchanged.
    Confirm whether callers are expected to pass a path or raw text.

    Args:
        text (str): The value forwarded to ``textract.process``.

    Returns:
        str: The UTF-8 decoded textract output, or the original ``text``
        if extraction or decoding fails.
    """
    try:
        raw_bytes = textract.process(text)
        extracted = raw_bytes.decode('utf-8')
    except Exception as e:
        # Deliberate best-effort: report the problem and fall back to the input.
        print(f"Error stripping metadata: {e}")
        return text
    return extracted
if __name__ == '__main__':
    # Local import: only this demo needs it, to probe for the sample file.
    import os

    # Example usage on an in-memory block of text.
    text_with_metadata = """
This is a test document.
Author: John Doe
Date: 2023-10-27
Title: Test Document
"""
    stripped_text = strip_metadata(text_with_metadata)
    print("Original text:\n", text_with_metadata)
    print("\nStripped text:\n", stripped_text)

    # Example with a file.
    pdf_path = "test.pdf"
    if not os.path.exists(pdf_path):
        print("test.pdf not found.")
    else:
        try:
            # textract.process expects the document's path; handing it an
            # open file object (as this demo previously did) fails before
            # any extraction happens.
            pdf_text = textract.process(pdf_path).decode('utf-8')
            stripped_pdf_text = strip_metadata(pdf_text)
            print("\nStripped PDF text:\n", stripped_pdf_text)
        except Exception as e:
            # Top-level demo boundary: report any extraction failure and move on.
            print(f"Error processing PDF: {e}")
# (Stray page text left over from the original paste: "Add your comment")