import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Utility for normalizing free-form text entries.
 *
 * <p>Each entry is stripped of HTML-like tags and of every character that is
 * not an ASCII letter, ASCII digit or whitespace; whitespace runs are then
 * collapsed to a single space, the result is trimmed and lower-cased.
 */
public class DataCleaner {

    /** Matches an HTML-like tag: {@code <}, any run of non-{@code >} characters, then {@code >}. */
    private static final Pattern HTML_TAG = Pattern.compile("<[^>]*>");

    /** Matches any single character that is not an ASCII letter, ASCII digit or whitespace. */
    private static final Pattern SPECIAL_CHARACTERS = Pattern.compile("[^a-zA-Z0-9\\s]");

    /** Matches one or more consecutive whitespace characters. */
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

    /**
     * Cleans every entry of {@code data} and returns the non-empty results in input order.
     *
     * <p>{@code null} entries, blank entries, and entries that become empty once
     * cleaned (for example {@code "<br>"} or {@code "!!!"}) are omitted from the result.
     *
     * @param data the raw entries; individual elements may be {@code null}
     * @return a new mutable list holding the cleaned entries; never {@code null}
     * @throws NullPointerException if {@code data} itself is {@code null}
     */
    public static List<String> cleanData(List<String> data) {
        Objects.requireNonNull(data, "data");
        List<String> cleanedData = new ArrayList<>(data.size());
        for (String item : data) {
            if (item == null) {
                continue;
            }
            String cleanedItem = cleanItem(item);
            // Skip entries with nothing left, consistent with skipping blank input.
            if (!cleanedItem.isEmpty()) {
                cleanedData.add(cleanedItem);
            }
        }
        return cleanedData;
    }

    /**
     * Runs the full cleaning pipeline on a single non-null entry.
     *
     * @param item the raw entry
     * @return the cleaned entry, possibly empty
     */
    private static String cleanItem(String item) {
        String cleaned = removeHtmlTags(item);
        cleaned = removeSpecialCharacters(cleaned);
        // Whitespace is normalized only AFTER the removals above: deleting a tag
        // or punctuation can leave adjacent or leading/trailing spaces behind.
        cleaned = WHITESPACE_RUN.matcher(cleaned).replaceAll(" ").trim();
        // Locale.ROOT keeps the result independent of the JVM default locale
        // (e.g. Turkish, where 'I' would otherwise lower-case to a dotless 'ı').
        return cleaned.toLowerCase(Locale.ROOT);
    }

    /**
     * Deletes every {@code <...>} span from {@code text}.
     *
     * <p>This is a plain regex match, not an HTML parser: any text enclosed
     * between a {@code <} and the next {@code >} is removed, tag or not.
     *
     * @param text the text to process
     * @return {@code text} without the matched spans
     */
    private static String removeHtmlTags(String text) {
        return HTML_TAG.matcher(text).replaceAll("");
    }

    /**
     * Deletes every character that is not an ASCII letter, ASCII digit or whitespace.
     *
     * <p>Non-ASCII letters and digits (for example accented characters) are removed as well.
     *
     * @param text the text to process
     * @return {@code text} restricted to ASCII alphanumerics and whitespace
     */
    private static String removeSpecialCharacters(String text) {
        return SPECIAL_CHARACTERS.matcher(text).replaceAll("");
    }

    /**
     * Example usage: cleans a handful of sample entries and prints each result on its own line.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        List<String> dirtyData = new ArrayList<>();
        dirtyData.add(" This is a test! ");
        dirtyData.add("Some text with <HTML> tags.");
        dirtyData.add("Special chars: !@#$%^&*()");
        dirtyData.add("Another test with multiple spaces.");
        dirtyData.add(null);
        dirtyData.add("");
        List<String> cleanedData = cleanData(dirtyData);
        for (String item : cleanedData) {
            System.out.println(item);
        }
    }
}
// Add your comment