import pandas as pd
import numpy as np
def collect_metrics(data_generator, metrics=None):
    """Collect summary metrics from a stream of data batches.

    All batches are concatenated into a single DataFrame before the
    metrics are computed, so peak memory is proportional to the total
    data size.

    Args:
        data_generator: Iterable yielding batches, each a pandas DataFrame
            or a numpy array (arrays are converted to DataFrames with
            positional column labels).
        metrics: Optional list of metric names. Defaults to
            ['size', 'dtype_counts', 'null_counts', 'unique_counts',
            'min_max']. Kept for interface compatibility; every metric is
            cheap to compute, so all keys are always present in the result.

    Returns:
        dict with keys:
            'size': total number of cells across all batches,
            'dtype_counts': {column: dtype},
            'null_counts': {column: number of nulls},
            'unique_counts': {column: number of distinct values},
            'min_max': {numeric column: (min, max)}.

    Raises:
        TypeError: if a batch is neither a DataFrame nor a numpy array.
    """
    if metrics is None:
        # Sentinel default instead of a mutable default argument.
        metrics = ['size', 'dtype_counts', 'null_counts', 'unique_counts', 'min_max']

    frames = []
    for batch in data_generator:
        if isinstance(batch, pd.DataFrame):
            frames.append(batch)
        elif isinstance(batch, np.ndarray):
            # Normalize arrays to DataFrames so pd.concat below works
            # uniformly (pd.concat rejects raw ndarrays).
            frames.append(pd.DataFrame(batch))
        else:
            raise TypeError("Batch must be a pandas DataFrame or numpy array.")

    if not frames:
        return {
            'size': 0,
            'dtype_counts': {},
            'null_counts': {},
            'unique_counts': {},
            'min_max': {}
        }

    combined = pd.concat(frames, ignore_index=True)

    dtype_counts = {col: combined[col].dtype for col in combined.columns}
    null_counts = {col: int(combined[col].isnull().sum()) for col in combined.columns}
    unique_counts = {col: combined[col].nunique() for col in combined.columns}
    # Per-column min/max over numeric columns only. (The original reused
    # the last column's min/max for every entry of the result.)
    min_max = {
        col: (combined[col].min(), combined[col].max())
        for col in combined.select_dtypes(include=np.number).columns
    }

    return {
        'size': combined.size,
        'dtype_counts': dtype_counts,
        'null_counts': null_counts,
        'unique_counts': unique_counts,
        'min_max': min_max
    }
if __name__ == '__main__':
    # Example usage with synthetic data.
    def dummy_data_generator(num_batches=3, batch_size=100):
        """Yield DataFrames with numeric and string columns for a demo run."""
        for _ in range(num_batches):
            # Every column must have the same length, or the DataFrame
            # constructor raises ValueError (the original mixed 100/50/50).
            data = {
                'col1': np.random.rand(batch_size),
                'col2': np.random.randint(0, 10, batch_size),
                'col3': ['a', 'b'] * (batch_size // 2),
            }
            yield pd.DataFrame(data)

    metrics = collect_metrics(dummy_data_generator())
    print(metrics)