import re
import pandas as pd
# A line is recognized only if it starts with a "YYYY-MM-DD HH:MM:SS" stamp;
# everything after the stamp (up to the newline) is treated as the message.
_LOG_LINE_RE = re.compile(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})(.*)')
_LOG_COLUMNS = ['timestamp', 'message']


def normalize_log_data(log_file_path: str, date_format: str = '%Y-%m-%d %H:%M:%S') -> pd.DataFrame:
    """
    Parse a log file into a DataFrame of (timestamp, message) rows.

    Only lines that begin with a ``YYYY-MM-DD HH:MM:SS`` stamp are kept; all
    other lines are skipped. The stamp is converted with ``date_format`` and
    the rest of the line, stripped of surrounding whitespace, becomes the
    message. Note that line recognition uses the fixed pattern above;
    ``date_format`` only controls how a recognized stamp is converted.

    Args:
        log_file_path (str): Path to the log file.
        date_format (str): Format used to convert the captured stamp.

    Returns:
        pandas.DataFrame: Columns ``timestamp`` and ``message``, one row per
        recognized line. Stamps that cannot be converted with ``date_format``
        become ``NaT`` (a warning is printed). If the file is missing, empty,
        or has no recognized lines, an empty DataFrame with the same two
        columns is returned and a message is printed.
    """
    saw_any_line = False
    timestamp_strs = []
    messages = []

    try:
        # errors='replace' keeps one undecodable byte from aborting the whole
        # parse; the platform default encoding is otherwise left unchanged.
        with open(log_file_path, 'r', errors='replace') as f:
            # Stream the file instead of loading every line into memory first.
            for line in f:
                saw_any_line = True
                match = _LOG_LINE_RE.match(line)
                if match:
                    timestamp_strs.append(match.group(1))
                    messages.append(match.group(2).strip())
    except FileNotFoundError:
        print(f"Error: Log file not found at {log_file_path}")
        return pd.DataFrame(columns=_LOG_COLUMNS)

    if not saw_any_line:
        print("Log file is empty.")
        return pd.DataFrame(columns=_LOG_COLUMNS)

    if not timestamp_strs:
        print("No data extracted from the log file.")
        return pd.DataFrame(columns=_LOG_COLUMNS)

    # Convert all stamps in one call; errors='coerce' turns unparseable
    # values into NaT so the column keeps a datetime dtype.
    timestamps = pd.to_datetime(timestamp_strs, format=date_format, errors='coerce')
    df = pd.DataFrame({'timestamp': timestamps, 'message': messages}, columns=_LOG_COLUMNS)

    if df['timestamp'].isnull().any():
        print("Warning: Invalid timestamps found. Check date format and log file content.")
    return df
if __name__ == '__main__':
    # Demo run: write a small throwaway log to disk, then normalize it.
    log_file = 'sample.log'
    sample_entries = (
        '2023-11-15 10:00:00 This is a log message.\n',
        '2023-11-15 10:05:00 Another log entry.\n',
        'Invalid Timestamp\n',  # deliberately has no leading timestamp
        '2023-11-15 10:10:00 Yet another message.\n',
    )
    with open(log_file, 'w') as handle:
        handle.writelines(sample_entries)

    normalized_df = normalize_log_data(log_file)
    print(normalized_df.head())
Add your comment