import re
from urllib.parse import urlparse
import requests
def validate_url(url):
    """
    Validate that *url* parses as an absolute http(s) URL.

    Args:
        url: The URL string to validate.

    Returns:
        A ``(is_valid, error)`` tuple: ``(True, None)`` on success,
        otherwise ``(False, <error message>)``.
    """
    try:
        parsed = urlparse(url)
    except Exception as e:
        return False, f"URL parsing error: {str(e)}"
    # Both a scheme and a network location are required for an absolute URL.
    if not parsed.scheme or not parsed.netloc:
        return False, "Invalid URL format"
    if parsed.scheme not in ("http", "https"):
        return False, "Unsupported scheme"
    return True, None
def check_url_safety(url):
    """
    Check a URL against simple heuristics for suspicious content.

    Args:
        url: The URL string to inspect.

    Returns:
        A ``(is_safe, reason)`` tuple: ``(True, None)`` when no heuristic
        fires, otherwise ``(False, <reason string>)``.
    """
    # Known local/example hosts are flagged so callers can review them in context.
    if "http://example.com" in url:
        return False, "URL points to example.com (safe)"
    if "http://localhost" in url:
        return False, "URL points to localhost (potentially safe, but check context)"
    if "http://127.0.0.1" in url:
        return False, "URL points to 127.0.0.1 (potentially safe, but check context)"
    # Long runs of dots are a common obfuscation trick.
    if re.search(r"[.]{5,}", url):
        return False, "URL contains excessive dots"
    # BUG FIX: the original whitelist [^a-zA-Z0-9._~:\/-] rejected every URL
    # containing a query string or fragment (?, =, &, %, #, ...), flagging
    # ordinary URLs as malicious.  Allow the RFC 3986 unreserved + reserved
    # characters plus %, so only genuinely out-of-place characters
    # (spaces, control characters, quotes, angle brackets, ...) are flagged.
    if re.search(r"[^a-zA-Z0-9._~:/?#\[\]@!$&'()*+,;=%-]", url):
        return False, "URL contains potentially malicious characters"
    return True, None
def flag_anomalous_urls(url_list, threshold=0.8):
    """
    Flag anomalous URLs in a list based on validation and safety checks.

    Args:
        url_list: A list of URLs to check.
        threshold: A threshold for the overall anomaly score. If the score
            exceeds the threshold, a summary of flagged URLs is printed.

    Returns:
        A list of ``{"url": ..., "reason": ...}`` dicts, one per flagged URL.
    """
    flagged_urls = []
    total_urls = len(url_list)

    for url in url_list:
        # Structural validation first; skip further checks if the URL is invalid.
        is_valid, validation_error = validate_url(url)
        if not is_valid:
            flagged_urls.append({"url": url, "reason": validation_error})
            continue

        # Heuristic safety checks.
        is_safe, safety_error = check_url_safety(url)
        if not is_safe:
            flagged_urls.append({"url": url, "reason": safety_error})
            continue

        # Reachability check: does the site respond with a 2xx/3xx status?
        # BUG FIX: the original never closed the response (leaking the
        # connection) and downloaded the entire body just to read the status
        # code.  stream=True defers the body download and close() releases
        # the connection on every path.
        try:
            response = requests.get(url, timeout=5, stream=True)
            try:
                response.raise_for_status()  # HTTPError on 4xx/5xx responses
            finally:
                response.close()
        except requests.exceptions.RequestException as e:
            flagged_urls.append({"url": url, "reason": f"Website unreachable: {str(e)}"})
            continue

    # Simple overall anomaly score: the fraction of URLs that were flagged.
    anomaly_score = (len(flagged_urls) / total_urls) if total_urls > 0 else 0.0

    if anomaly_score > threshold:
        print(f"Anomaly score ({anomaly_score:.2f}) exceeds threshold ({threshold}).")
        print("Flagged URLs:")
        for item in flagged_urls:
            print(f" - {item['url']}: {item['reason']}")

    return flagged_urls
if __name__ == '__main__':
    # Example usage:
    url_list = [
        "https://www.google.com",
        "http://example.com",
        "http://malicious.example.com/evil.php",
        "invalid-url",
        "https://www.example.com/path with spaces",
        "http://127.0.0.1",
        "http://localhost:8000",
        "http://example.com/a..b",
        "https://verylongdomainnamewithmanydots.com",
    ]
    # BUG FIX: the original file ended with a truncated call
    # ("flagged = flag_an") followed by stray non-Python text, which made the
    # script a SyntaxError.  Complete the call and print the results.
    flagged = flag_anomalous_urls(url_list)
    for item in flagged:
        print(f"{item['url']}: {item['reason']}")