백엔드기술/개발언어

자바 HTML 태그 제거 정규식

RevFactory 2011. 11. 22. 17:41

private String getText(String content) {
 Pattern SCRIPTS = Pattern.compile("<(no)?script[^>]*>.*?</(no)?script>",Pattern.DOTALL);
 Pattern STYLE = Pattern.compile("<style[^>]*>.*</style>",Pattern.DOTALL);
 Pattern TAGS = Pattern.compile("<(\"[^\"]*\"|\'[^\']*\'|[^\'\">])*>");
 Pattern nTAGS = Pattern.compile("<\\w+\\s+[^<]*\\s*>");
 Pattern ENTITY_REFS = Pattern.compile("&[^;]+;");
 Pattern WHITESPACE = Pattern.compile("\\s\\s+");
 
 Matcher m;
 
 m = SCRIPTS.matcher(content);
 content = m.replaceAll("");
 m = STYLE.matcher(content);
 content = m.replaceAll("");
 m = TAGS.matcher(content);
 content = m.replaceAll("");
 m = ENTITY_REFS.matcher(content);
 content = m.replaceAll("");
 m = WHITESPACE.matcher(content);
 content = m.replaceAll(" ");   
 
 return content;
}