import pandas as pd
import numpy as np
def validate_dataset(file_path, expected_columns=None, data_type_checks=None, min_value_checks=None, max_value_checks=None, missing_value_threshold=0.1):
    """Validate a CSV or Excel dataset and report the problems found.

    The checks run in a fixed order: expected columns, data types, minimum
    values, maximum values, and finally the overall share of missing cells.
    Problems are collected rather than raised, so one call reports everything
    that is wrong with the file.

    Args:
        file_path: Path to the dataset file (``.csv``, ``.xlsx`` or ``.xls``).
            The extension is matched case-insensitively, and path-like
            objects are accepted.
        expected_columns: Optional list of column names that must be present.
        data_type_checks: Optional mapping of column name to the dtype the
            column must be convertible to, e.g.
            ``{'column_name': 'int64', 'another_column': 'float64'}``.
            A column that converts successfully is replaced by its converted
            form, so the min/max checks below see the converted values.
        min_value_checks: Optional mapping of column name to the smallest
            allowed value, e.g. ``{'column_name': 0}``.
        max_value_checks: Optional mapping of column name to the largest
            allowed value, e.g. ``{'column_name': 100}``.
        missing_value_threshold: Largest acceptable fraction of missing
            cells (missing cells / total cells). Defaults to 0.1.

    Returns:
        list: Human-readable error/warning strings, in the order the checks
        ran. The list is empty when no problems were found.
    """
    errors = []

    # Decide on the reader up front so the try block below only wraps the
    # actual file access. str() lets pathlib.Path objects through, and
    # lower() makes "DATA.CSV" behave like "data.csv".
    normalized_path = str(file_path).lower()
    if normalized_path.endswith('.csv'):
        reader = pd.read_csv
    elif normalized_path.endswith(('.xlsx', '.xls')):
        reader = pd.read_excel
    else:
        errors.append(f"Error: Unsupported file format: {file_path}")
        return errors  # Nothing else can be checked without data.

    try:
        df = reader(file_path)
    except FileNotFoundError:
        errors.append(f"Error: File not found: {file_path}")
        return errors
    except Exception as e:  # Parser/engine failures are reported, not raised.
        errors.append(f"Error: Could not read file: {e}")
        return errors

    # Measure missing data on the frame exactly as it was read: the dtype
    # conversions below (e.g. a cast to str) could otherwise hide NaNs.
    total_cells = df.size
    missing_count = int(df.isnull().sum().sum())

    # Check for missing columns
    if expected_columns:
        missing_cols = [col for col in expected_columns if col not in df.columns]
        if missing_cols:
            errors.append(f"Missing columns: {missing_cols}")

    # Check for data types (a column passes if it can be cast to the type)
    if data_type_checks:
        for col, expected_type in data_type_checks.items():
            if col not in df.columns:
                errors.append(f"Error: Column '{col}' not found in dataset.")
                continue  # Skip to the next check
            try:
                df[col] = df[col].astype(expected_type)
            except ValueError:
                errors.append(f"Error: Column '{col}' has incorrect data type. Expected {expected_type}, got {df[col].dtype}")
            except Exception as e:
                errors.append(f"Error: Could not convert column '{col}' to type {expected_type}: {e}")

    # Check for min/max values. Both checks share one loop; only the
    # comparison and the wording of the message differ.
    bound_checks = (
        (min_value_checks, "below", "minimum", lambda series, bound: series < bound),
        (max_value_checks, "above", "maximum", lambda series, bound: series > bound),
    )
    for checks, direction, label, violates in bound_checks:
        if not checks:
            continue
        for col, bound in checks.items():
            if col not in df.columns:
                errors.append(f"Error: Column '{col}' not found in dataset.")
                continue
            try:
                out_of_range = bool(violates(df[col], bound).any())
            except (TypeError, ValueError) as e:
                # E.g. a text column compared with a number: report it
                # instead of letting the whole validation crash.
                errors.append(f"Error: Could not compare column '{col}' against the {label} of {bound}: {e}")
                continue
            if out_of_range:
                errors.append(f"Error: Column '{col}' contains values {direction} the {label} of {bound}")

    # Check for missing values as a fraction of all cells (rows x columns),
    # so the figure stays within 0-100% regardless of the column count.
    missing_percentage = (missing_count / total_cells) if total_cells > 0 else 0
    if missing_percentage > missing_value_threshold:
        errors.append(f"Warning: Missing values exceed threshold ({missing_percentage:.2%}). Total missing values: {missing_count}")

    return errors
# Add your comment