import re
from urllib.parse import urlparse

import requests
  4. def validate_url(url):
  5. """
  6. Validates a URL and checks for basic safety.
  7. """
  8. try:
  9. result = urlparse(url)
  10. if not all([result.scheme, result.netloc]):
  11. return False, "Invalid URL format"
  12. if result.scheme not in ["http", "https"]:
  13. return False, "Unsupported scheme"
  14. return True, None
  15. except Exception as e:
  16. return False, f"URL parsing error: {str(e)}"
  17. def check_url_safety(url):
  18. """
  19. Checks if a URL is potentially malicious based on heuristics.
  20. """
  21. # Basic checks
  22. if "http://example.com" in url:
  23. return False, "URL points to example.com (safe)"
  24. if "http://localhost" in url:
  25. return False, "URL points to localhost (potentially safe, but check context)"
  26. if "http://127.0.0.1" in url:
  27. return False, "URL points to 127.0.0.1 (potentially safe, but check context)"
  28. # Check for suspicious characters (e.g., excessive dots, unusual characters)
  29. if re.search(r"[.]{5,}", url):
  30. return False, "URL contains excessive dots"
  31. if re.search(r"[^a-zA-Z0-9._~:\/-]", url):
  32. return False, "URL contains potentially malicious characters"
  33. return True, None
  34. def flag_anomalous_urls(url_list, threshold=0.8):
  35. """
  36. Flags anomalous URLs in a list based on validation and safety checks.
  37. Args:
  38. url_list: A list of URLs to check.
  39. threshold: A threshold for the overall anomaly score. If the score
  40. exceeds the threshold, the URL is flagged.
  41. Returns:
  42. A list of flagged URLs and their corresponding reasons.
  43. """
  44. flagged_urls = []
  45. total_urls = len(url_list)
  46. for url in url_list:
  47. is_valid, validation_error = validate_url(url)
  48. if not is_valid:
  49. flagged_urls.append({"url": url, "reason": validation_error})
  50. continue # Skip further checks if URL is invalid
  51. is_safe, safety_error = check_url_safety(url)
  52. if not is_safe:
  53. flagged_urls.append({"url": url, "reason": safety_error})
  54. continue
  55. # Add more sophisticated checks here (e.g., checking website content)
  56. # Example: Check if the website exists and returns a valid HTTP status code
  57. try:
  58. response = requests.get(url, timeout=5)
  59. response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
  60. except requests.exceptions.RequestException as e:
  61. flagged_urls.append({"url": url, "reason": f"Website unreachable: {str(e)}"})
  62. continue
  63. # Calculate an overall anomaly score (simple example)
  64. total_anomalies = len(flagged_urls)
  65. if total_urls > 0:
  66. anomaly_score = total_anomalies / total_urls
  67. else:
  68. anomaly_score = 0.0
  69. if anomaly_score > threshold:
  70. print(f"Anomaly score ({anomaly_score:.2f}) exceeds threshold ({threshold}).")
  71. print("Flagged URLs:")
  72. for item in flagged_urls:
  73. print(f" - {item['url']}: {item['reason']}")
  74. return flagged_urls
  75. if __name__ == '__main__':
  76. # Example usage:
  77. url_list = [
  78. "https://www.google.com",
  79. "http://example.com",
  80. "http://malicious.example.com/evil.php",
  81. "invalid-url",
  82. "https://www.example.com/path with spaces",
  83. "http://127.0.0.1",
  84. "http://localhost:8000",
  85. "http://example.com/a..b",
  86. "https://verylongdomainnamewithmanydots.com",
  87. ]
  88. flagged = flag_an

Add your comment