1. import pandas as pd
  2. import numpy as np
  3. def validate_dataset(file_path, expected_columns=None, data_type_checks=None, min_value_checks=None, max_value_checks=None, missing_value_threshold=0.1):
  4. """
  5. Validates a dataset for common errors.
  6. Args:
  7. file_path (str): Path to the dataset file (CSV or Excel).
  8. expected_columns (list, optional): List of expected column names. Defaults to None.
  9. data_type_checks (dict, optional): Dictionary specifying data type checks for columns.
  10. e.g., {'column_name': 'int64', 'another_column': 'float64'}
  11. min_value_checks (dict, optional): Dictionary specifying minimum value checks for columns.
  12. e.g., {'column_name': 0}
  13. max_value_checks (dict, optional): Dictionary specifying maximum value checks for columns.
  14. e.g., {'column_name': 100}
  15. missing_value_threshold (float, optional): Acceptable percentage of missing values. Defaults to 0.1.
  16. Returns:
  17. dict: A dictionary containing a list of validation errors. Returns an empty dict if no errors found.
  18. """
  19. errors = []
  20. try:
  21. if file_path.endswith('.csv'):
  22. df = pd.read_csv(file_path)
  23. elif file_path.endswith('.xlsx') or file_path.endswith('.xls'):
  24. df = pd.read_excel(file_path)
  25. else:
  26. errors.append(f"Error: Unsupported file format: {file_path}")
  27. return errors # Exit early if file type is not supported
  28. except FileNotFoundError:
  29. errors.append(f"Error: File not found: {file_path}")
  30. return errors
  31. except Exception as e:
  32. errors.append(f"Error: Could not read file: {e}")
  33. return errors
  34. # Check for missing columns
  35. if expected_columns:
  36. missing_cols = [col for col in expected_columns if col not in df.columns]
  37. if missing_cols:
  38. errors.append(f"Missing columns: {missing_cols}")
  39. # Check for data types
  40. if data_type_checks:
  41. for col, expected_type in data_type_checks.items():
  42. if col not in df.columns:
  43. errors.append(f"Error: Column '{col}' not found in dataset.")
  44. continue # Skip to the next check
  45. try:
  46. df[col] = df[col].astype(expected_type)
  47. except ValueError:
  48. errors.append(f"Error: Column '{col}' has incorrect data type. Expected {expected_type}, got {df[col].dtype}")
  49. except Exception as e:
  50. errors.append(f"Error: Could not convert column '{col}' to type {expected_type}: {e}")
  51. # Check for min/max values
  52. if min_value_checks:
  53. for col, min_val in min_value_checks.items():
  54. if col not in df.columns:
  55. errors.append(f"Error: Column '{col}' not found in dataset.")
  56. continue
  57. if (df[col] < min_val).any():
  58. errors.append(f"Error: Column '{col}' contains values below the minimum of {min_val}")
  59. if max_value_checks:
  60. for col, max_val in max_value_checks.items():
  61. if col not in df.columns:
  62. errors.append(f"Error: Column '{col}' not found in dataset.")
  63. continue
  64. if (df[col] > max_val).any():
  65. errors.append(f"Error: Column '{col}' contains values above the maximum of {max_val}")
  66. # Check for missing values
  67. total_rows = len(df)
  68. missing_count = df.isnull().sum().sum()
  69. missing_percentage = (missing_count / total_rows) if total_rows > 0 else 0
  70. if missing_percentage > missing_value_threshold:
  71. errors.append(f"Warning: Missing values exceed threshold ({missing_percentage:.2%}). Total missing values: {missing_count}")
  72. return errors

Add your comment