import numpy as np
import pandas as pd
  3. def collect_metrics(data_generator, metrics=['size', 'dtype_counts', 'null_counts', 'unique_counts', 'min_max']):
  4. """
  5. Collects metrics of a dataset iteratively to minimize memory usage.
  6. Args:
  7. data_generator: An object that yields batches of data (e.g., a custom iterator).
  8. metrics: A list of metrics to calculate.
  9. Returns:
  10. A dictionary containing the collected metrics.
  11. """
  12. total_size = 0
  13. dtype_counts = {}
  14. null_counts = {}
  15. unique_counts = {}
  16. data_list = [] # Store data batches
  17. for batch in data_generator:
  18. if isinstance(batch, pd.DataFrame):
  19. data_list.append(batch)
  20. elif isinstance(batch, np.ndarray):
  21. data_list.append(batch)
  22. else:
  23. raise TypeError("Batch must be a pandas DataFrame or numpy array.")
  24. if not data_list:
  25. return {
  26. 'size': 0,
  27. 'dtype_counts': {},
  28. 'null_counts': {},
  29. 'unique_counts': {},
  30. 'min_max': {}
  31. }
  32. combined_data = pd.concat(data_list, ignore_index=True)
  33. total_size = combined_data.size
  34. # Calculate dtype counts
  35. for col in combined_data.columns:
  36. dtype_counts[col] = combined_data[col].dtype
  37. # Calculate null counts
  38. for col in combined_data.columns:
  39. null_counts[col] = combined_data[col].isnull().sum()
  40. # Calculate unique counts
  41. for col in combined_data.columns:
  42. unique_counts[col] = combined_data[col].nunique()
  43. # Calculate min/max for numeric columns
  44. for col in combined_data.select_dtypes(include=np.number).columns:
  45. min_val = combined_data[col].min()
  46. max_val = combined_data[col].max()
  47. data_list_col = []
  48. for batch in data_list:
  49. if isinstance(batch, pd.DataFrame):
  50. data_list_col.append(batch[col])
  51. elif isinstance(batch, np.ndarray):
  52. data_list_col.append(batch[:,col])
  53. min_val = np.min(np.concatenate(data_list_col))
  54. max_val = np.max(np.concatenate(data_list_col))
  55. data_list.remove(batch)
  56. min_val = combined_data[col].min()
  57. max_val = combined_data[col].max()
  58. data_list.append(combined_data[col])
  59. return {
  60. 'size': total_size,
  61. 'dtype_counts': dtype_counts,
  62. 'null_counts': null_counts,
  63. 'unique_counts': unique_counts,
  64. 'min_max': {col: (min_val, max_val) for col in combined_data.select_dtypes(include=np.number).columns}
  65. }
  66. if __name__ == '__main__':
  67. # Example Usage:
  68. def dummy_data_generator(num_batches=3):
  69. for i in range(num_batches):
  70. data = {'col1': np.random.rand(100), 'col2': np.random.randint(0, 10, 50), 'col3': ['a', 'b'] * 25}
  71. yield pd.DataFrame(data)
  72. metrics = collect_metrics(dummy_data_generator())
  73. print(metrics)

Add your comment