import re
import logging
# Root-logger setup: emit INFO and above, each record prefixed with a
# timestamp and its severity level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
# Compiled once at import time so the per-line loop only performs matching.
# Everything that is not a lowercase ASCII letter, a digit, or whitespace is
# dropped; this also covers punctuation and underscores, so one pass suffices.
_DISALLOWED_CHARS_RE = re.compile(r'[^a-z0-9\s]')
_WHITESPACE_RUN_RE = re.compile(r'\s+')


def _normalize_line(line):
    """Return *line* lowercased and reduced to ``a-z0-9`` words joined by single spaces."""
    cleaned = _DISALLOWED_CHARS_RE.sub('', line.lower())
    # Collapse whitespace LAST: deleting a character that sat between two
    # spaces (e.g. "a _ b") would otherwise leave a double space behind.
    return _WHITESPACE_RUN_RE.sub(' ', cleaned).strip()


def normalize_text(input_file, output_file):
    """
    Normalize a UTF-8 text file line by line and write the result to another file.

    Each input line is:
    - lowercased,
    - stripped of every character other than ``a-z``, ``0-9`` and whitespace
      (punctuation, underscores and non-ASCII characters are removed),
    - collapsed so that runs of whitespace become a single space, with
      leading/trailing whitespace trimmed.

    Exactly one output line (terminated by a newline) is written per input
    line, so line numbers in the two files correspond; an input line that
    normalizes to nothing produces an empty output line.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the file to create or overwrite with the result.

    Returns:
        None. Failures are logged rather than raised: a missing input file,
        an error while handling an individual line (that line is skipped),
        or any other unexpected error.
    """
    try:
        # Open the input on its own so that "not found" is only ever reported
        # for the input path; a missing output directory falls through to the
        # generic handler below with the real OS error message.
        try:
            infile = open(input_file, 'r', encoding='utf-8')
        except FileNotFoundError:
            logging.error(f"Input file not found: {input_file}")
            return

        with infile, open(output_file, 'w', encoding='utf-8') as outfile:
            for line_num, line in enumerate(infile, 1):
                try:
                    # The newline keeps output lines separate; without it all
                    # normalized lines would be fused into a single one.
                    outfile.write(_normalize_line(line) + '\n')
                except Exception as e:
                    # Best effort: report the offending line and carry on.
                    logging.error(f"Error processing line {line_num}: {e}. Line: {line.strip()}")
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
if __name__ == '__main__':
    # Demo invocation: point these two paths at your own source and
    # destination files before running the script.
    input_file, output_file = 'input.txt', 'output.txt'
    normalize_text(input_file, output_file)
    print(f"Text normalization complete. Output written to {output_file}")
# NOTE: a stray "Add your comment" line (web-page residue, not code) followed the script here.