1. import re
  2. import pandas as pd
  3. def normalize_log_data(log_file_path, date_format='%Y-%m-%d %H:%M:%S'):
  4. """
  5. Normalizes data from a log file, extracting relevant fields and converting timestamps.
  6. Args:
  7. log_file_path (str): Path to the log file.
  8. date_format (str): Format of the date/time string in the log file.
  9. Returns:
  10. pandas.DataFrame: A DataFrame containing the normalized log data. Returns an empty DataFrame if the file is not found or empty.
  11. """
  12. try:
  13. with open(log_file_path, 'r') as f:
  14. log_lines = f.readlines()
  15. except FileNotFoundError:
  16. print(f"Error: Log file not found at {log_file_path}")
  17. return pd.DataFrame()
  18. if not log_lines:
  19. print("Log file is empty.")
  20. return pd.DataFrame()
  21. data = []
  22. for line in log_lines:
  23. # Use regex to extract timestamp and message (adjust pattern as needed)
  24. match = re.match(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}).*?(?=\n|$)', line) #Capture timestamp until newline or end of line
  25. if match:
  26. timestamp_str = match.group(1)
  27. try:
  28. timestamp = pd.to_datetime(timestamp_str, format=date_format)
  29. except ValueError:
  30. timestamp = None # Handle invalid timestamps
  31. message = match.group(0).replace(timestamp_str, '', 1).strip() #remove timestamp from message
  32. data.append([timestamp, message])
  33. df = pd.DataFrame(data, columns=['timestamp', 'message'])
  34. #Basic sanity checks
  35. if df.empty:
  36. print("No data extracted from the log file.")
  37. return pd.DataFrame()
  38. #Check for timestamp validity
  39. if df['timestamp'].isnull().any():
  40. print("Warning: Invalid timestamps found. Check date format and log file content.")
  41. return df
  42. if __name__ == '__main__':
  43. #Example usage
  44. log_file = 'sample.log'
  45. # Create a dummy log file for testing
  46. with open(log_file, 'w') as f:
  47. f.write('2023-11-15 10:00:00 This is a log message.\n')
  48. f.write('2023-11-15 10:05:00 Another log entry.\n')
  49. f.write('Invalid Timestamp\n')
  50. f.write('2023-11-15 10:10:00 Yet another message.\n')
  51. normalized_df = normalize_log_data(log_file)
  52. print(normalized_df.head())

Add your comment