# Standard-library imports for path tokenization with retry support.
import re
import os
import logging
import time
import functools

# Configure the root logger once at import time: timestamped INFO-level messages.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
  7. def tokenize_file_paths(file_paths, max_retries=3, retry_delay=1):
  8. """
  9. Tokenizes a list of file paths, handling potential errors with retry logic.
  10. Args:
  11. file_paths (list): A list of file paths to tokenize.
  12. max_retries (int): The maximum number of retry attempts for each file path.
  13. retry_delay (int): The delay (in seconds) between retry attempts.
  14. Returns:
  15. list: A list of tokens, or None if tokenization fails after all retries.
  16. """
  17. def retry(func):
  18. @functools.wraps(func)
  19. def wrapper(*args, **kwargs):
  20. for attempt in range(max_retries):
  21. try:
  22. return func(*args, **kwargs)
  23. except Exception as e:
  24. logging.warning(f"Attempt {attempt + 1} failed: {e}")
  25. if attempt < max_retries - 1:
  26. time.sleep(retry_delay)
  27. else:
  28. logging.error(f"Failed to tokenize {args[0]} after {max_retries} attempts.")
  29. raise
  30. return wrapper
  31. def _tokenize_path(path):
  32. """
  33. Tokenizes a single file path.
  34. """
  35. # Basic validation: check if the path exists
  36. if not os.path.exists(path):
  37. raise ValueError(f"File path does not exist: {path}")
  38. # Remove leading/trailing whitespace
  39. path = path.strip()
  40. # Split path into components (directories and filename)
  41. parts = path.split(os.sep)
  42. # Sanitize the components (remove invalid characters)
  43. tokens = []
  44. for part in parts:
  45. # Only allow alphanumeric characters, underscores, and hyphens
  46. sanitized_part = re.sub(r'[^a-zA-Z0-9_\-]+', '', part)
  47. if sanitized_part: #Avoid adding empty strings
  48. tokens.append(sanitized_part)
  49. return tokens
  50. tokens = []
  51. for path in file_paths:
  52. try:
  53. tokens.append(_tokenize_path(path))
  54. except Exception as e:
  55. logging.error(f"Error tokenizing path '{path}': {e}")
  56. tokens.append(None) # Indicate failure for this path
  57. return tokens

Add your comment