import pandas as pd
def _is_missing(value):
    """Return True if *value* is None or a missing scalar (NaN, NaT, pd.NA)."""
    if value is None:
        return True
    # pd.isna() returns an array for list-likes, so only ask it about scalars.
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def _range_errors(col, value, rules):
    """Return the 'min'/'max' violation messages for an already-converted number."""
    errors = []
    if 'min' in rules and value < rules['min']:
        errors.append(f"Column '{col}' must be >= {rules['min']}")
    if 'max' in rules and value > rules['max']:
        errors.append(f"Column '{col}' must be <= {rules['max']}")
    return errors


def _value_errors(col, value, rules):
    """Return the list of rule violations for one non-missing cell value."""
    kind = rules['type']

    if kind == 'int':
        not_an_int = [f"Column '{col}' must be an integer"]
        # int() would silently truncate 3.7 to 3; reject non-integral floats
        # (this also covers +/-inf, for which is_integer() is False).
        if isinstance(value, float) and not value.is_integer():
            return not_an_int
        try:
            number = int(value)
        except (TypeError, ValueError, OverflowError):
            return not_an_int
        return _range_errors(col, number, rules)

    if kind == 'float':
        try:
            number = float(value)
        except (TypeError, ValueError):
            return [f"Column '{col}' must be a float"]
        return _range_errors(col, number, rules)

    if kind == 'string':
        text = str(value).strip()
        errors = []
        if 'min_length' in rules and len(text) < rules['min_length']:
            errors.append(f"Column '{col}' must be at least {rules['min_length']} characters")
        if 'max_length' in rules and len(text) > rules['max_length']:
            errors.append(f"Column '{col}' must be at most {rules['max_length']} characters")
        # re.match anchors at the start of the string only (not a full match).
        if 'pattern' in rules and not re.match(rules['pattern'], text):
            errors.append(f"Column '{col}' must match pattern: {rules['pattern']}")
        return errors

    # Unknown rule types are not checked.
    return []


def validate_data(df, validation_rules, overrides=None):
    """
    Validate the columns of a DataFrame against per-column rules.

    The DataFrame is modified in place: a boolean 'validation_error' column is
    added (or reset) and set to True for every row in which at least one
    validated column violates its rules. The messages collected for each
    column are printed to stdout.

    Args:
        df (pd.DataFrame): The DataFrame to validate. Mutated in place.
        validation_rules (dict): Maps a column name to its rule dictionary.
            Every rule needs a 'type' of 'int', 'float' or 'string'; other
            types are not checked. Optional keys: 'required' (missing values
            are errors), 'min'/'max' (numeric types), and
            'min_length'/'max_length'/'pattern' (string type, applied to the
            stripped text).
            Example: {'column_name': {'type': 'int', 'min': 0, 'max': 100}}
        overrides (dict, optional): Maps a row index label to a dictionary of
            {column: replacement value}. Only columns present in
            validation_rules are written. Defaults to None.

    Returns:
        pd.DataFrame: The same DataFrame object, with the 'validation_error'
        column filled in and any overrides applied.

    Raises:
        KeyError: If a rule names a column that is not in the DataFrame, if a
            rule has no 'type' key, or if an override refers to an index label
            that does not exist.

    Note:
        Overrides are applied after validation, so 'validation_error' reflects
        the values as they were before the overrides were written.
        NOTE(review): confirm this ordering is what callers expect.
    """
    df['validation_error'] = False  # Initialize validation error column

    # Track failures by row position so duplicate index labels cannot flag
    # rows that did not fail.
    row_failed = [False] * len(df)

    for col, rules in validation_rules.items():
        col_errors = []  # Every message raised for this column, for reporting
        required = rules.get('required', False)

        for position, value in enumerate(df[col]):
            if _is_missing(value):
                # pandas stores None as NaN in numeric columns, so NaN/NaT/NA
                # are treated as missing as well.
                errors = [f"Column '{col}' is required"] if required else []
            else:
                errors = _value_errors(col, value, rules)

            if errors:
                row_failed[position] = True
                col_errors.extend(errors)

        if col_errors:
            print(f"Validation errors for column '{col}': {col_errors}")

    if any(row_failed):
        df['validation_error'] = row_failed

    # Apply overrides
    if overrides:
        for index, override in overrides.items():
            # Raises KeyError for an unknown label; without this lookup the
            # assignment below would silently append a new row instead.
            df.loc[index]
            for col, value in override.items():
                if col in validation_rules:
                    df.loc[index, col] = value

    return df
import re #Import Regular expressions for string validation
if __
Add your comment