1. import pandas as pd
  2. def validate_data(df, validation_rules, overrides=None):
  3. """
  4. Validates data in a Pandas DataFrame against predefined rules, allowing manual overrides.
  5. Args:
  6. df (pd.DataFrame): The DataFrame to validate.
  7. validation_rules (dict): A dictionary defining validation rules for each column.
  8. Example: {'column_name': {'type': 'int', 'min': 0, 'max': 100}}
  9. overrides (dict, optional): A dictionary of specific overrides for individual rows.
  10. Keys are row indices, values are dictionaries of overrides.
  11. Defaults to None.
  12. Returns:
  13. pd.DataFrame: The validated DataFrame. Rows that fail validation are flagged with a 'validation_error' column.
  14. """
  15. df['validation_error'] = False # Initialize validation error column
  16. for col, rules in validation_rules.items():
  17. col_errors = [] # Store errors for each column
  18. for index, row in df.iterrows():
  19. value = row[col]
  20. if value is None:
  21. if rules.get('required', False):
  22. col_errors.append(f"Column '{col}' is required")
  23. df.loc[index, 'validation_error'] = True
  24. continue # Skip to next row if value is None and not required
  25. if rules['type'] == 'int':
  26. try:
  27. value = int(value)
  28. except ValueError:
  29. col_errors.append(f"Column '{col}' must be an integer")
  30. df.loc[index, 'validation_error'] = True
  31. continue # Skip if not an integer
  32. if 'min' in rules and value < rules['min']:
  33. col_errors.append(f"Column '{col}' must be >= {rules['min']}")
  34. df.loc[index, 'validation_error'] = True
  35. if 'max' in rules and value > rules['max']:
  36. col_errors.append(f"Column '{col}' must be <= {rules['max']}")
  37. df.loc[index, 'validation_error'] = True
  38. elif rules['type'] == 'float':
  39. try:
  40. value = float(value)
  41. except ValueError:
  42. col_errors.append(f"Column '{col}' must be a float")
  43. df.loc[index, 'validation_error'] = True
  44. continue
  45. if 'min' in rules and value < rules['min']:
  46. col_errors.append(f"Column '{col}' must be >= {rules['min']}")
  47. df.loc[index, 'validation_error'] = True
  48. if 'max' in rules and value > rules['max']:
  49. col_errors.append(f"Column '{col}' must be <= {rules['max']}")
  50. df.loc[index, 'validation_error'] = True
  51. elif rules['type'] == 'string':
  52. value = str(value).strip()
  53. if 'min_length' in rules and len(value) < rules['min_length']:
  54. col_errors.append(f"Column '{col}' must be at least {rules['min_length']} characters")
  55. df.loc[index, 'validation_error'] = True
  56. if 'max_length' in rules and len(value) > rules['max_length']:
  57. col_errors.append(f"Column '{col}' must be at most {rules['max_length']} characters")
  58. df.loc[index, 'validation_error'] = True
  59. if 'pattern' in rules and not re.match(rules['pattern'], value):
  60. col_errors.append(f"Column '{col}' must match pattern: {rules['pattern']}")
  61. df.loc[index, 'validation_error'] = True
  62. if col_errors:
  63. df.loc[index, 'validation_error'] = col_errors # Store all errors in a single cell
  64. if col_errors:
  65. print(f"Validation errors for column '{col}': {col_errors}")
  66. # Apply overrides
  67. if overrides:
  68. for index, override in overrides.items():
  69. row = df.loc[index]
  70. for col, value in override.items():
  71. if col in validation_rules:
  72. df.loc[index, col] = value
  73. return df
  74. import re #Import Regular expressions for string validation
  75. if __

Add your comment