import logging
import time
import urllib.error

import pandas as pd
import requests
# Configure the root logger once at import time so the retry/failure
# messages emitted below are actually visible at INFO level.
logging.basicConfig(level=logging.INFO)
def load_dataset(url, max_retries=3, retry_delay=5):
    """
    Load a CSV dataset from a URL or local path, with retry logic.

    Args:
        url (str): URL or local file path of the CSV dataset.
        max_retries (int): Maximum number of load attempts.
        retry_delay (int): Delay in seconds between attempts.

    Returns:
        pandas.DataFrame: The loaded DataFrame, or None if loading fails
        after all retries or an unexpected error occurs.
    """
    for attempt in range(1, max_retries + 1):
        try:
            # pandas fetches URLs via urllib, so network failures surface as
            # urllib.error.URLError (HTTPError is a subclass) — NOT as
            # requests exceptions; the original requests handler was dead code.
            # Adjust the reader for other formats (read_excel, etc.).
            df = pd.read_csv(url)
        except (urllib.error.URLError, pd.errors.ParserError) as e:
            # Transient network hiccups and truncated downloads are worth
            # retrying; both branches share the same backoff logic.
            logging.warning("Attempt %d failed: %s", attempt, e)
            if attempt < max_retries:
                logging.info("Retrying in %s seconds...", retry_delay)
                time.sleep(retry_delay)
            else:
                logging.error(
                    "Failed to load dataset from %s after %d attempts.",
                    url, max_retries,
                )
                return None
        except Exception:
            # Unexpected errors (e.g. FileNotFoundError for a bad local
            # path) are not retried — retrying cannot fix them.
            logging.exception("An unexpected error occurred loading %s", url)
            return None
        else:
            logging.info(
                "Successfully loaded dataset from %s on attempt %d", url, attempt
            )
            return df
    return None  # defensive: only reachable if max_retries < 1
if __name__ == '__main__':
    # Demo: fetch the plotly iris sample and show the first few rows.
    sample_url = "https://raw.githubusercontent.com/plotly/datasets/master/iris.csv"  # Replace with your dataset URL
    frame = load_dataset(sample_url)
    if frame is None:
        print("Failed to load dataset.")
    else:
        print("Dataset loaded successfully:")
        print(frame.head())