import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
  /**
   * Normalises free-form text entries: strips HTML-like tags and non-alphanumeric
   * characters, collapses whitespace and lower-cases the result.
   */
  public class DataCleaner {

    /** Anything between {@code <} and the next {@code >}, treated as a tag. */
    private static final Pattern HTML_TAG = Pattern.compile("<[^>]*>");

    /** Every character that is not an ASCII letter, an ASCII digit or whitespace. */
    private static final Pattern SPECIAL_CHARACTERS = Pattern.compile("[^a-zA-Z0-9\\s]");

    /** One or more consecutive whitespace characters. */
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

    /**
     * Cleans every entry of {@code data} and returns the non-blank results in order.
     *
     * <p>Each entry goes through these steps: HTML-like tags are removed, characters
     * other than ASCII letters, digits and whitespace are removed, runs of whitespace
     * are collapsed to a single space, surrounding whitespace is trimmed and the text
     * is lower-cased. {@code null} entries and entries that are blank after cleaning
     * are skipped.
     *
     * @param data the raw entries; may be {@code null} and may contain {@code null}
     * @return a new mutable list with the cleaned entries; never {@code null}
     */
    public static List<String> cleanData(List<String> data) {
      List<String> cleanedData = new ArrayList<>();
      if (data == null) {
        return cleanedData;
      }
      for (String item : data) {
        if (item == null) {
          continue;
        }
        String cleanedItem = removeHtmlTags(item);
        cleanedItem = removeSpecialCharacters(cleanedItem);
        // Whitespace is normalised last: removing tags and punctuation can leave
        // behind doubled, leading or trailing spaces.
        cleanedItem = WHITESPACE_RUN.matcher(cleanedItem).replaceAll(" ").trim();
        if (cleanedItem.isEmpty()) {
          continue;
        }
        // Locale.ROOT keeps the result independent of the JVM default locale
        // (e.g. "I" must become "i", not the Turkish dotless "ı").
        cleanedData.add(cleanedItem.toLowerCase(Locale.ROOT));
      }
      return cleanedData;
    }

    /**
     * Removes every {@code <...>} sequence from {@code text}.
     *
     * @param text the text to process; must not be {@code null}
     * @return {@code text} without tag-like sequences
     */
    private static String removeHtmlTags(String text) {
      return HTML_TAG.matcher(text).replaceAll("");
    }

    /**
     * Removes every character that is not an ASCII letter, ASCII digit or whitespace.
     *
     * @param text the text to process; must not be {@code null}
     * @return {@code text} restricted to ASCII alphanumerics and whitespace
     */
    private static String removeSpecialCharacters(String text) {
      return SPECIAL_CHARACTERS.matcher(text).replaceAll("");
    }

    /**
     * Example usage: cleans a few sample strings and prints each result on its own line.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
      List<String> dirtyData = new ArrayList<>();
      dirtyData.add(" This is a test! ");
      dirtyData.add("Some text with <HTML> tags.");
      dirtyData.add("Special chars: !@#$%^&*()");
      dirtyData.add("Another test with multiple spaces.");
      dirtyData.add(null);
      dirtyData.add("");
      List<String> cleanedData = cleanData(dirtyData);
      for (String item : cleanedData) {
        System.out.println(item);
      }
    }
  }

Add your comment