1. import java.util.ArrayList;
  2. import java.util.List;
  3. import java.util.regex.Matcher;
  4. import java.util.regex.Pattern;
  /**
   * Splits a log entry into tokens in a single left-to-right pass.
   *
   * <p>Three token shapes are recognised, tried in this order at every position so that the
   * most specific shape wins:
   * <ol>
   *   <li>a timestamp such as {@code 2023-10-27 10:00:00.123+0000}, with or without
   *       surrounding square brackets (brackets, when present, are kept in the token);</li>
   *   <li>a {@code key=value} pair, where the value may contain {@code .} or {@code -}
   *       between word characters (for example {@code user=john.doe});</li>
   *   <li>a run of word characters, which also covers plain numbers.</li>
   * </ol>
   * Anything else (whitespace, punctuation) acts as a separator and is dropped.
   */
  public class LogTokenizer {

    /** Regex source for the timestamp body, without surrounding brackets. */
    private static final String TIMESTAMP =
        "\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}[+-]\\d{4}";

    /**
     * Combined token pattern, compiled once. Alternatives are ordered from most to least
     * specific because the regex engine takes the first alternative that matches at a
     * given position.
     */
    private static final Pattern TOKEN = Pattern.compile(
        "\\[" + TIMESTAMP + "\\]"            // bracketed timestamp
            + "|" + TIMESTAMP                // bare timestamp
            + "|\\w+=\\w+(?:[.-]\\w+)*"      // key=value, value may be dotted/hyphenated
            + "|\\w+");                      // word or number

    /**
     * Tokenizes a log entry.
     *
     * <p>Each part of the input contributes to at most one token, and tokens are returned
     * in the order they appear in the input.
     *
     * @param logEntry the raw log line; may be {@code null}
     * @return a new, mutable list of tokens; empty when {@code logEntry} is {@code null}
     *     or contains no recognisable token
     */
    public static List<String> tokenize(String logEntry) {
      List<String> tokens = new ArrayList<>();
      if (logEntry == null) {
        return tokens;
      }
      Matcher matcher = TOKEN.matcher(logEntry);
      while (matcher.find()) {
        tokens.add(matcher.group());
      }
      return tokens;
    }

    /**
     * Prints the tokens of a sample log entry to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
      String logEntry =
          "2023-10-27 10:00:00.123+0000 [INFO] User logged in. user=john.doe, role=admin";
      List<String> tokens = tokenize(logEntry);
      System.out.println(tokens);
    }
  }

Add your comment