import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Splits a single log line into tokens using regular expressions.
 *
 * <p>Recognised token shapes, in priority order:
 * <ol>
 *   <li>a timestamp of the form {@code yyyy-MM-dd HH:mm:ss.SSS+ZZZZ}, either bare or
 *       wrapped in square brackets (the brackets are kept as part of the token);</li>
 *   <li>a {@code key=value} pair, where the value may contain dot-separated word
 *       segments (for example {@code user=john.doe});</li>
 *   <li>a run of word characters ({@code [A-Za-z0-9_]}), which also covers plain numbers.</li>
 * </ol>
 * Any other character (whitespace, punctuation, stray brackets) acts as a separator.
 */
public class LogTokenizer {

    /** Timestamp body without the optional surrounding brackets. */
    private static final String TIMESTAMP =
            "\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}\\+\\d{4}";

    /**
     * Combined token pattern. Alternatives are tried left to right at each position, so the
     * more specific shapes (timestamp, key=value) must come before the generic word match.
     * Compiled once because {@link Pattern} instances are immutable and reusable.
     */
    private static final Pattern TOKEN = Pattern.compile(
            "\\[" + TIMESTAMP + "\\]"          // bracketed timestamp
            + "|" + TIMESTAMP                  // bare timestamp
            + "|" + "\\w+=\\w+(?:\\.\\w+)*"    // key=value, value may be dotted
            + "|" + "\\w+");                   // word or number

    /**
     * Tokenizes a log entry in a single left-to-right scan.
     *
     * <p>Each character of the input belongs to at most one token, and tokens are returned
     * in the order in which they appear in the entry.
     *
     * @param logEntry the log line to tokenize; may be {@code null}
     * @return a new mutable list of tokens in input order; empty when {@code logEntry}
     *         is {@code null}, empty, or contains no word characters
     */
    public static List<String> tokenize(String logEntry) {
        List<String> tokens = new ArrayList<>();
        if (logEntry == null || logEntry.isEmpty()) {
            return tokens;
        }
        Matcher matcher = TOKEN.matcher(logEntry);
        while (matcher.find()) {
            tokens.add(matcher.group());
        }
        return tokens;
    }

    /**
     * Demo entry point: tokenizes a sample log line and prints the resulting list.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String logEntry = "2023-10-27 10:00:00.123+0000 [INFO] User logged in. user=john.doe, role=admin";
        List<String> tokens = tokenize(logEntry);
        System.out.println(tokens);
    }
}
// Add your comment