import re
import os
import logging
import time
import functools
# Configure the root logger once at import time: INFO level, timestamped output.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def tokenize_file_paths(file_paths, max_retries=3, retry_delay=1):
    """
    Tokenize a list of file paths, retrying each path on failure.

    Each path is stripped, validated for existence, split on the OS path
    separator, and its components sanitized down to alphanumerics,
    underscores, and hyphens.

    Args:
        file_paths (list): File paths to tokenize.
        max_retries (int): Maximum attempts per path before giving up.
        retry_delay (int): Seconds to sleep between attempts.

    Returns:
        list: One entry per input path — a list of sanitized token strings
        on success, or None if tokenization failed after all retries.
    """
    def retry(func):
        """Retry `func` up to max_retries times, sleeping between attempts."""
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    logging.warning(f"Attempt {attempt + 1} failed: {e}")
                    if attempt < max_retries - 1:
                        time.sleep(retry_delay)
                    else:
                        logging.error(f"Failed to tokenize {args[0]} after {max_retries} attempts.")
                        raise
        return wrapper

    # BUG FIX: the retry decorator was defined but never applied, so the
    # documented retry behavior never actually happened.
    @retry
    def _tokenize_path(path):
        """Tokenize a single file path into sanitized components."""
        # BUG FIX: strip BEFORE the existence check — originally the strip
        # ran after validation, so paths with stray surrounding whitespace
        # always failed even when the cleaned path existed.
        path = path.strip()
        if not os.path.exists(path):
            raise ValueError(f"File path does not exist: {path}")
        # Split into components and sanitize each one.
        parts = path.split(os.sep)
        tokens = []
        for part in parts:
            # Only allow alphanumeric characters, underscores, and hyphens.
            sanitized_part = re.sub(r'[^a-zA-Z0-9_\-]+', '', part)
            if sanitized_part:  # avoid adding empty strings
                tokens.append(sanitized_part)
        return tokens

    tokens = []
    for path in file_paths:
        try:
            tokens.append(_tokenize_path(path))
        except Exception as e:
            logging.error(f"Error tokenizing path '{path}': {e}")
            tokens.append(None)  # indicate failure for this path
    return tokens