/*********************************************************************************************************
 * Program name : TagFilter.java
 * Description  : class for obtaining project-related information
 * Author       : Kang Won-jung
 * Created      : 2004.01.06
 * Modified     : 2003.11.30
 **********************************************************************************************************/
package kr.co.kihyun.text.html;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import net.htmlparser.jericho.Attribute;
import net.htmlparser.jericho.CharacterReference;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.EndTagType;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.HTMLElements;
import net.htmlparser.jericho.OutputDocument;
import net.htmlparser.jericho.Segment;
import net.htmlparser.jericho.StartTag;
import net.htmlparser.jericho.StartTagType;
import net.htmlparser.jericho.Tag;

import org.w3c.dom.Document;
import org.w3c.tidy.Tidy;

/**
 * A {@link Tidy} subclass that first filters its HTML input through a whitelist (elements, attributes and
 * inline style properties) using the Jericho parser, and then lets JTidy build a DOM from the filtered
 * markup.
 *
 * <p>
 * The filtering pass runs once in the constructor in "strip" mode: tags that are not accepted are removed
 * and the text between tags is re-encoded.
 */
public class Html extends Tidy {
    private static final long serialVersionUID = 1L;

    // NOTE(review): "hh" looks odd next to height = "height"; the value is kept as-is because callers
    // outside this file may depend on it.
    public static final String width = "hh";
    public static final String height = "height";
    public static final String tdWidth = "";
    public static final String tdHeight = "25";

    // 50. Declaration error via public static (CWE-500): deleted by YOUNGJUN,CHO
    // public static String tdOption = " ";
    // ------------------------------------------------

    // public static String titleBGColor = "#E0E0E0";
    public static final String titleBGColor = "#f3f6fc";
    public static final String inputBGColor = "#FFFFFF";
    public static final String titleFontColor = "#404040";
    public static final String strFontColor = "#4F484F";
    public static final String intFontColor = "#4F484F";
    public static final String dynaTotLineFontColor = "red"; // totals row (last line) of a dynamic table

    /** Names of the HTML elements that are retained in the final output. */
    private static final Set<String> VALID_ELEMENT_NAMES = new HashSet<String>(Arrays.asList(
            HTMLElementName.A, HTMLElementName.ABBR, HTMLElementName.ACRONYM, HTMLElementName.ADDRESS,
            HTMLElementName.APPLET, HTMLElementName.AREA, HTMLElementName.B, HTMLElementName.BASE,
            HTMLElementName.BASEFONT, HTMLElementName.BDO, HTMLElementName.BIG, HTMLElementName.BLOCKQUOTE,
            HTMLElementName.BODY, HTMLElementName.BR, HTMLElementName.BUTTON, HTMLElementName.CAPTION,
            HTMLElementName.CENTER, HTMLElementName.CITE, HTMLElementName.CODE, HTMLElementName.COL,
            HTMLElementName.COLGROUP, HTMLElementName.DD, HTMLElementName.DEL, HTMLElementName.DFN,
            HTMLElementName.DIR, HTMLElementName.DIV, HTMLElementName.DL, HTMLElementName.DT,
            HTMLElementName.EM, HTMLElementName.FIELDSET, HTMLElementName.FONT, HTMLElementName.FORM,
            HTMLElementName.FRAME, HTMLElementName.FRAMESET, HTMLElementName.H1, HTMLElementName.H2,
            HTMLElementName.H3, HTMLElementName.H4, HTMLElementName.H5, HTMLElementName.H6,
            HTMLElementName.HEAD, HTMLElementName.HR, HTMLElementName.HTML, HTMLElementName.I,
            HTMLElementName.IFRAME, HTMLElementName.IMG, HTMLElementName.INPUT, HTMLElementName.INS,
            HTMLElementName.ISINDEX, HTMLElementName.KBD, HTMLElementName.LABEL, HTMLElementName.LEGEND,
            HTMLElementName.LI, HTMLElementName.LINK, HTMLElementName.MAP, HTMLElementName.MENU,
            HTMLElementName.META, HTMLElementName.NOFRAMES, HTMLElementName.NOSCRIPT, HTMLElementName.OBJECT,
            HTMLElementName.OL, HTMLElementName.OPTGROUP, HTMLElementName.OPTION, HTMLElementName.P,
            HTMLElementName.PARAM, HTMLElementName.PRE, HTMLElementName.Q, HTMLElementName.S,
            HTMLElementName.SAMP, HTMLElementName.SCRIPT, HTMLElementName.SELECT, HTMLElementName.SMALL,
            HTMLElementName.SPAN, HTMLElementName.STRIKE, HTMLElementName.STRONG, HTMLElementName.STYLE,
            HTMLElementName.SUB, HTMLElementName.SUP, HTMLElementName.TABLE, HTMLElementName.TBODY,
            HTMLElementName.TD, HTMLElementName.TEXTAREA, HTMLElementName.TFOOT, HTMLElementName.TH,
            HTMLElementName.THEAD, HTMLElementName.TITLE, HTMLElementName.TR, HTMLElementName.TT,
            HTMLElementName.U, HTMLElementName.UL, HTMLElementName.VAR));

    /**
     * Names of the HTML attributes that are retained in the final output. Compared against
     * {@code Attribute.getKey()}.
     */
    private static final Set<String> VALID_ATTRIBUTE_NAMES = new HashSet<String>(Arrays.asList(
            "id", "class", "hreflang", "title", "name", "rel", "rev", "type", "defer", "language",
            "xml:space", "style", "dir", "lang", "xml:lang", "xmlns", "ismap", "charset", "coords",
            "target", "href", "nohref", "size", "color", "face", "cite", "link", "alink", "vlink",
            "media", "shape", "height", "width", "align", "valign", "alt", "start", "label", "multiple",
            "archive", "code", "codebase", "codetype", "classid", "data", "declare", "standby", "hspace",
            "object", "vspace", "background", "bgcolor", "text", "disabled", "value", "valuetype", "char",
            "charoff", "span", "datetime", "compact", "profile", "noshade", "action", "accept",
            "accept-charset", "enctype", "method", "border", "usemap", "frameborder", "longdesc",
            "marginheight", "marginwidth", "scrolling", "src", "checked", "maxlength", "readonly",
            "prompt", "for", "content", "http-equiv", "scheme", "cellpadding", "cellspacing", "frame",
            "summary", "abbr", "axis", "colspan", "headers", "nowrap", "rowspan", "scope", "cols", "rows",
            "accesskey", "tabindex", "onload", "onunload", "onblur", "onchange", "onfocus", "onreset",
            "onselect", "onsubmit", "onabort", "onkeydown", "onkeypress", "onkeyup", "onclick",
            // was misspelled "ondbclick", which never matched the real double-click handler attribute
            "ondblclick",
            "onmousedown", "onmousemove", "onmouseout", "onmouseover", "onmouseup"));

    /**
     * Names of the inline-style (CSS) properties that are retained inside a {@code style} attribute.
     *
     * <p>
     * NOTE(review): the original list contained "font-family" and "float" twice each; those duplicates
     * look like typos for "font-style" and "clear" — confirm before widening the whitelist.
     */
    private static final Set<String> VALID_PROPERTY_NAMES = new HashSet<String>(Arrays.asList(
            "font-family", "font-variant", "font-weight", "font-size", "font", "color",
            "background-color", "background-image", "background-repeat", "background-attachment",
            "background-position", "background", "word-spacing", "letter-spacing", "text-decoration",
            "vertical-align", "text-transform", "text-align", "text-indent", "line-height", "margin-top",
            "margin-right", "margin-bottom", "margin-left", "margin", "padding-top", "padding-right",
            "padding-bottom", "padding-left", "padding", "border-top-width", "border-right-width",
            "border-bottom-width", "border-left-width", "border-width", "border-color", "border-style",
            "border-top", "border-right", "border-bottom", "border-left", "border", "width", "height",
            "float", "display", "white-space", "list-style-type", "list-style-image",
            "list-style-position", "list-style"));

    /** Stored as user data on every tag accepted by {@link #processTag}; checked by identity. */
    private static final Object VALID_MARKER = new Object();

    // public static final String DOC_TYPE = "\"-//W3C//DTD XHTML 1.0 Transitional//EN\" \""+Moumi.getDtdUri()+"\"";
    public static final String DOC_TYPE = "\"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://localhost:7001/dtd/loose.dtd\"";

    private net.htmlparser.jericho.Source source = null;
    private OutputDocument outputDocument = null;

    /**
     * Builds a filtered document from a byte stream.
     *
     * <p>
     * NOTE(review): the stream is decoded with the platform default charset because no charset is passed
     * to {@link InputStreamReader} — confirm this matches the encoding of the incoming HTML.
     *
     * @param in stream supplying the HTML markup
     */
    public Html(InputStream in) {
        this(new InputStreamReader(in));
    }

    /**
     * Builds a filtered document from markup held in a string.
     *
     * @param in the HTML markup
     */
    public Html(String in) {
        this(new StringReader(in));
    }

    /**
     * Configures the Tidy options, parses everything supplied by {@code reader} with Jericho and runs the
     * whitelist filter once in "strip" mode without white-space formatting.
     *
     * @param reader supplies the HTML markup
     * @throws IllegalStateException if the markup cannot be read; previously the {@link IOException} was
     *         only printed, which left the instance without a parsed source and caused a
     *         {@link NullPointerException} on later use
     */
    public Html(Reader reader) {
        super();
        setTidyMark(false);
        setDocType(DOC_TYPE);
        setDropEmptyParas(true);
        setDropProprietaryAttributes(true);
        setHideComments(true);

        net.htmlparser.jericho.Source parsed;
        try {
            parsed = new net.htmlparser.jericho.Source(reader);
        } catch (IOException e) {
            throw new IllegalStateException("Unable to read HTML input", e);
        }
        source = parsed;
        // isValidLITag relies on parent-element lookups, so the whole document is parsed up front.
        source.fullSequentialParse();
        outputDocument = new OutputDocument(source);
        sanitise(false, true);
    }

    /*
     * override setDocType so that the DTD can also be handled automatically.
     */

    /**
     * Feeds the filtered markup to JTidy and returns the resulting DOM.
     *
     * @return the DOM document produced by {@code Tidy.parseDOM(Reader, Writer)}
     */
    public Document parseDOM() {
        // Tidy's serialised output goes to a throw-away writer; only the DOM is of interest here.
        StringWriter out = new StringWriter();
        return parseDOM(new StringReader(outputDocument.toString()), out);
    }

    /** Same as {@link #encodeInvalidMarkup(boolean)} with {@code formatWhiteSpace = false}. */
    public void encodeInvalidMarkup() {
        encodeInvalidMarkup(false);
    }

    /**
     * Runs the filter leaving rejected tags in place, so that they are character-encoded together with
     * the surrounding text.
     *
     * <p>
     * NOTE(review): the constructor has already run the filter once on the same output document; confirm
     * that Jericho accepts replacements being registered again for the same segments.
     *
     * @param formatWhiteSpace whether text is encoded with white-space formatting
     */
    public void encodeInvalidMarkup(boolean formatWhiteSpace) {
        sanitise(formatWhiteSpace, false);
    }

    /** Same as {@link #stripInvalidMarkup(boolean)} with {@code formatWhiteSpace = false}. */
    public void stripInvalidMarkup() {
        stripInvalidMarkup(false);
    }

    /**
     * Runs the filter removing every rejected tag from the output.
     *
     * <p>
     * NOTE(review): the constructor has already run the filter once on the same output document; confirm
     * that Jericho accepts replacements being registered again for the same segments.
     *
     * @param formatWhiteSpace whether text is encoded with white-space formatting
     */
    public void stripInvalidMarkup(boolean formatWhiteSpace) {
        sanitise(formatWhiteSpace, true);
    }

    // TODO: implement
    /*
     * public void toPDF(OutputStream out) { toPDF(out, defaultStyleSheet); }
     *
     * public void toPDF(OutputStream out, File styleSheet) {
     *     out = new BufferedOutputStream(out);
     *     try {
     *         Fop fop = fopFactory.newFop(MimeConstants.MIME_PDF, foUserAgent, out);
     *         TransformerFactory factory = TransformerFactory.newInstance();
     *         Transformer transformer = factory.newTransformer(new StreamSource(styleSheet));
     *         transformer.setParameter("versionParam", "2.0");
     *         LOG.debug("EXPORTING FROM:\n{}", domBuf);
     *         Source src = new StreamSource(new StringReader(domBuf));
     *         // Result res = new SAXResult(fop.getDefaultHandler());
     *         Result res = new StreamResult(out);
     *         transformer.transform(src, res);
     *         out.close();
     *     } catch (FOPException e) {
     *         e.printStackTrace();
     *     } catch (TransformerConfigurationException e) {
     *         e.printStackTrace();
     *     } catch (TransformerException e) {
     *         e.printStackTrace();
     *     } catch (IOException e) {
     *         e.printStackTrace();
     *     }
     * }
     */

    /**
     * Walks every tag of the source in document order, rewriting accepted tags and re-encoding the text
     * that lies between tags.
     *
     * @param formatWhiteSpace whether text is encoded with white-space formatting
     * @param stripInvalidElements {@code true} to remove rejected tags; {@code false} to leave them so
     *        they are encoded as part of the surrounding text
     */
    private void sanitise(boolean formatWhiteSpace, boolean stripInvalidElements) {
        List<Tag> tags = source.getAllTags();
        int pos = 0; // end of the last tag handled; start of the next text run
        for (Tag tag : tags) {
            if (processTag(tag, outputDocument)) {
                tag.setUserData(VALID_MARKER);
            } else {
                if (!stripInvalidElements) {
                    continue; // pos is not advanced: the tag is encoded along with surrounding text
                }
                outputDocument.remove(tag);
            }
            reencodeTextSegment(source, outputDocument, pos, tag.getBegin(), formatWhiteSpace);
            pos = tag.getEnd();
        }
        reencodeTextSegment(source, outputDocument, pos, source.getEnd(), formatWhiteSpace);
    }

    /**
     * Decides whether a tag is kept and, if so, registers its tidied replacement in the output document.
     *
     * @param tag the tag to examine
     * @param outputDocument receives the replacement (and any inserted optional end tag)
     * @return {@code true} if the tag was accepted, {@code false} if it must be stripped or encoded
     */
    private static boolean processTag(Tag tag, OutputDocument outputDocument) {
        String elementName = tag.getName();
        if (!VALID_ELEMENT_NAMES.contains(elementName)) {
            return false;
        }
        // equals() rather than ==, so the LI checks do not depend on name strings being shared instances
        boolean isListItem = HTMLElementName.LI.equals(elementName);

        if (tag.getTagType() == StartTagType.NORMAL) {
            Element element = tag.getElement();
            if (HTMLElements.getEndTagRequiredElementNames().contains(elementName)) {
                if (element.getEndTag() == null) {
                    return false; // reject start tag if its required end tag is missing
                }
            } else if (HTMLElements.getEndTagOptionalElementNames().contains(elementName)) {
                if (isListItem && !isValidLITag(tag)) {
                    return false; // reject invalid LI tags
                }
                if (element.getEndTag() == null) {
                    // insert optional end tag if it is missing
                    outputDocument.insert(element.getEnd(), getEndTagHTML(elementName));
                }
            }
            outputDocument.replace(tag, getStartTagHTML(element.getStartTag()));
        } else if (tag.getTagType() == EndTagType.NORMAL) {
            if (tag.getElement() == null) {
                return false; // reject end tags that aren't associated with a start tag
            }
            if (isListItem && !isValidLITag(tag)) {
                return false; // reject invalid LI tags
            }
            outputDocument.replace(tag, getEndTagHTML(elementName));
        } else {
            return false; // reject abnormal tags
        }
        return true;
    }

    /**
     * Checks that an LI tag sits directly inside an already-accepted UL or OL element.
     *
     * @param tag a start or end tag of an LI element
     * @return {@code true} only if the immediate parent element exists, its start tag carries
     *         {@link #VALID_MARKER}, and it is a UL or OL
     */
    private static boolean isValidLITag(Tag tag) {
        Element parentElement = tag.getElement().getParentElement();
        if (parentElement == null) {
            return false; // ignore LI elements without a parent
        }
        if (parentElement.getStartTag().getUserData() != VALID_MARKER) {
            return false; // ignore LI elements whose parent is not valid
        }
        // only accept LI tags whose immediate parent is UL or OL
        String parentName = parentElement.getName();
        return HTMLElementName.UL.equals(parentName) || HTMLElementName.OL.equals(parentName);
    }

    /**
     * Decodes the character references in {@code [begin, end)} of the source and registers the
     * re-encoded text as the replacement for that range. Does nothing for an empty range.
     *
     * @param source the parsed source document
     * @param outputDocument receives the replacement
     * @param begin start offset (inclusive) in the source
     * @param end end offset (exclusive) in the source
     * @param formatWhiteSpace whether to encode with white-space formatting
     */
    private static void reencodeTextSegment(net.htmlparser.jericho.Source source, OutputDocument outputDocument,
            int begin, int end, boolean formatWhiteSpace) {
        if (begin >= end) {
            return;
        }
        Segment textSegment = new Segment(source, begin, end);
        String decodedText = CharacterReference.decode(textSegment);
        String encodedText = formatWhiteSpace
                ? CharacterReference.encodeWithWhiteSpaceFormatting(decodedText)
                : CharacterReference.encode(decodedText);
        outputDocument.replace(textSegment, encodedText);
    }

    /**
     * Rebuilds a start tag, keeping only whitelisted attributes and, inside {@code style}, only
     * whitelisted properties. Attribute values are character-encoded and double-quoted.
     *
     * @param startTag the original start tag
     * @return the tidied start tag markup
     */
    private static CharSequence getStartTagHTML(StartTag startTag) {
        StringBuilder sb = new StringBuilder();
        sb.append('<').append(startTag.getName());
        for (Attribute attribute : startTag.getAttributes()) {
            if (!VALID_ATTRIBUTE_NAMES.contains(attribute.getKey())) {
                continue;
            }
            sb.append(' ').append(attribute.getName());
            String value = attribute.getValue();
            if (value == null) {
                continue; // attribute without a value: the name alone is emitted
            }
            sb.append("=\"");
            if ("style".equals(attribute.getKey())) {
                for (String property : value.split(";")) {
                    // indexOf instead of split(":")[0]: ":".split(":") yields an empty array, so a
                    // declaration consisting only of colons used to throw ArrayIndexOutOfBoundsException.
                    int colon = property.indexOf(':');
                    String name = (colon < 0 ? property : property.substring(0, colon)).trim();
                    if (VALID_PROPERTY_NAMES.contains(name)) {
                        sb.append(CharacterReference.encode(property));
                        sb.append("; ");
                    }
                }
            } else {
                sb.append(CharacterReference.encode(value));
            }
            sb.append('"');
        }
        // elements with neither an end tag nor an optional end tag are written in self-closing form
        if (startTag.getElement().getEndTag() == null
                && !HTMLElements.getEndTagOptionalElementNames().contains(startTag.getName())) {
            sb.append(" /");
        }
        sb.append('>');
        return sb;
    }

    /**
     * Builds the markup of a plain end tag.
     *
     * @param tagName the element name
     * @return {@code "</" + tagName + ">"}
     */
    private static String getEndTagHTML(String tagName) {
        return "</" + tagName + '>';
    }
}