1. import re
  2. import logging
  3. # Configure logging
  4. logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
  5. def normalize_text(input_file, output_file):
  6. """
  7. Normalizes text data from an input file, addressing common issues like:
  8. - Lowercasing
  9. - Removing punctuation
  10. - Removing extra whitespace
  11. - Replacing special characters
  12. - Error logging for problematic lines.
  13. """
  14. try:
  15. with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
  16. for line_num, line in enumerate(infile, 1): # Enumerate for line numbers
  17. try:
  18. # Lowercase the line
  19. line = line.lower()
  20. # Remove punctuation
  21. line = re.sub(r'[^\w\s]', '', line)
  22. # Remove extra whitespace
  23. line = re.sub(r'\s+', ' ', line).strip()
  24. # Replace special characters (example: keep only alphanumeric and spaces)
  25. line = re.sub(r'[^a-z0-9\s]', '', line)
  26. # Write the normalized line to the output file
  27. outfile.write(line)
  28. except Exception as e:
  29. logging.error(f"Error processing line {line_num}: {e}. Line: {line.strip()}")
  30. except FileNotFoundError:
  31. logging.error(f"Input file not found: {input_file}")
  32. except Exception as e:
  33. logging.error(f"An unexpected error occurred: {e}")
  34. if __name__ == '__main__':
  35. # Example usage:
  36. input_file = 'input.txt' # Replace with your input file
  37. output_file = 'output.txt' # Replace with your desired output file
  38. normalize_text(input_file, output_file)
  39. print(f"Text normalization complete. Output written to {output_file}")

Add your comment