You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
294 lines
14 KiB
294 lines
14 KiB
/*********************************************************************************************************
 * Program name : TagFilter.java
 * Description  : Class for obtaining project-related information
 * Author       : 강원중
 * Created      : 2004.01.06
 * Modified     : 2003.11.30
 **********************************************************************************************************/
|
|
|
package kr.co.kihyun.text.html; |
|
|
|
import java.io.IOException; |
|
import java.io.InputStream; |
|
import java.io.InputStreamReader; |
|
import java.io.Reader; |
|
import java.io.StringReader; |
|
import java.io.StringWriter; |
|
import java.util.Arrays; |
|
import java.util.HashSet; |
|
import java.util.List; |
|
import java.util.Set; |
|
|
|
import net.htmlparser.jericho.Attribute; |
|
import net.htmlparser.jericho.CharacterReference; |
|
import net.htmlparser.jericho.Element; |
|
import net.htmlparser.jericho.EndTagType; |
|
import net.htmlparser.jericho.HTMLElementName; |
|
import net.htmlparser.jericho.HTMLElements; |
|
import net.htmlparser.jericho.OutputDocument; |
|
import net.htmlparser.jericho.Segment; |
|
import net.htmlparser.jericho.StartTag; |
|
import net.htmlparser.jericho.StartTagType; |
|
import net.htmlparser.jericho.Tag; |
|
|
|
import org.w3c.dom.Document; |
|
import org.w3c.tidy.Tidy; |
|
|
|
public class Html extends Tidy { |
|
/** |
|
* |
|
*/ |
|
private static final long serialVersionUID = 1L; |
|
|
|
public static final String width = "hh"; |
|
public static final String height = "height"; |
|
public static final String tdWidth = ""; |
|
public static final String tdHeight = "25"; |
|
|
|
//50.public static을 통한 선언 오류(CWE-500) : Delete by YOUNGJUN,CHO |
|
//public static String tdOption = " "; |
|
//------------------------------------------------ |
|
|
|
// public static String titleBGColor = "#E0E0E0"; |
|
public static final String titleBGColor = "#f3f6fc"; |
|
public static final String inputBGColor = "#FFFFFF"; |
|
|
|
public static final String titleFontColor = "#404040"; |
|
public static final String strFontColor = "#4F484F"; |
|
public static final String intFontColor = "#4F484F"; |
|
public static final String dynaTotLineFontColor = "red"; // 동적표의 마지막 한 라인 집계 |
|
|
|
// list of HTML elements that will be retained in the final output: |
|
private static final Set<String> VALID_ELEMENT_NAMES = new HashSet<String>(Arrays.asList(new String[] { |
|
HTMLElementName.A, HTMLElementName.ABBR, HTMLElementName.ACRONYM, HTMLElementName.ADDRESS, |
|
HTMLElementName.APPLET, HTMLElementName.AREA, HTMLElementName.B, HTMLElementName.BASE, |
|
HTMLElementName.BASEFONT, HTMLElementName.BDO, HTMLElementName.BIG, HTMLElementName.BLOCKQUOTE, |
|
HTMLElementName.BODY, HTMLElementName.BR, HTMLElementName.BUTTON, HTMLElementName.CAPTION, |
|
HTMLElementName.CENTER, HTMLElementName.CITE, HTMLElementName.CODE, HTMLElementName.COL, |
|
HTMLElementName.COLGROUP, HTMLElementName.DD, HTMLElementName.DEL, HTMLElementName.DFN, |
|
HTMLElementName.DIR, HTMLElementName.DIV, HTMLElementName.DL, HTMLElementName.DT, HTMLElementName.EM, |
|
HTMLElementName.FIELDSET, HTMLElementName.FONT, HTMLElementName.FORM, HTMLElementName.FRAME, |
|
HTMLElementName.FRAMESET, HTMLElementName.H1, HTMLElementName.H2, HTMLElementName.H3, HTMLElementName.H4, |
|
HTMLElementName.H5, HTMLElementName.H6, HTMLElementName.HEAD, HTMLElementName.HR, HTMLElementName.HTML, |
|
HTMLElementName.I, HTMLElementName.IFRAME, HTMLElementName.IMG, HTMLElementName.INPUT, HTMLElementName.INS, |
|
HTMLElementName.ISINDEX, HTMLElementName.KBD, HTMLElementName.LABEL, HTMLElementName.LEGEND, |
|
HTMLElementName.LI, HTMLElementName.LINK, HTMLElementName.MAP, HTMLElementName.MENU, HTMLElementName.META, |
|
HTMLElementName.NOFRAMES, HTMLElementName.NOSCRIPT, HTMLElementName.OBJECT, HTMLElementName.OL, |
|
HTMLElementName.OPTGROUP, HTMLElementName.OPTION, HTMLElementName.P, HTMLElementName.PARAM, |
|
HTMLElementName.PRE, HTMLElementName.Q, HTMLElementName.S, HTMLElementName.SAMP, HTMLElementName.SCRIPT, |
|
HTMLElementName.SELECT, HTMLElementName.SMALL, HTMLElementName.SPAN, HTMLElementName.STRIKE, |
|
HTMLElementName.STRONG, HTMLElementName.STYLE, HTMLElementName.SUB, HTMLElementName.SUP, |
|
HTMLElementName.TABLE, HTMLElementName.TBODY, HTMLElementName.TD, HTMLElementName.TEXTAREA, |
|
HTMLElementName.TFOOT, HTMLElementName.TH, HTMLElementName.THEAD, HTMLElementName.TITLE, |
|
HTMLElementName.TR, HTMLElementName.TT, HTMLElementName.U, HTMLElementName.UL, HTMLElementName.VAR })); |
|
|
|
// list of HTML attributes that will be retained in the final output: |
|
private static final Set<String> VALID_ATTRIBUTE_NAMES = new HashSet<String>(Arrays.asList(new String[] { "id", |
|
"class", "hreflang", "title", "name", "rel", "rev", "type", "defer", "language", "xml:space", "style", |
|
"dir", "lang", "xml:lang", "xmlns", "ismap", "charset", "coords", "target", "href", "nohref", "size", |
|
"color", "face", "cite", "link", "alink", "vlink", "media", "shape", "height", "width", "align", "valign", |
|
"alt", "start", "label", "multiple", "archive", "code", "codebase", "codetype", "classid", "data", |
|
"declare", "standby", "hspace", "object", "vspace", "background", "bgcolor", "text", "disabled", "value", |
|
"valuetype", "char", "charoff", "span", "datetime", "compact", "profile", "noshade", "action", "accept", |
|
"accept-charset", "enctype", "method", "border", "ismap", "usemap", "frameborder", "longdesc", |
|
"marginheight", "marginwidth", "scrolling", "src", "checked", "maxlength", "readonly", "prompt", "for", |
|
"content", "http-equiv", "scheme", "cellpadding", "cellspacing", "frame", "summary", "abbr", "axis", |
|
"colspan", "headers", "nowrap", "rowspan", "scope", "cols", "rows", "accesskey", "tabindex", "onload", |
|
"onunload", "onblur", "onchange", "onfocus", "onreset", "onselect", "onsubmit", "onabort", "onkeydown", |
|
"onkeypress", "onkeyup", "onclick", "ondbclick", "onmousedown", "onmousemove", "onmouseout", "onmouseover", |
|
"onmouseup" })); |
|
|
|
// list of HTML properties that will be retained in the final output: |
|
private static final Set<String> VALID_PROPERTY_NAMES = new HashSet<String>(Arrays.asList(new String[] { |
|
"font-family", "font-family", "font-variant", "font-weight", "font-size", "font", "color", |
|
"background-color", "background-image", "background-repeat", "background-attachment", |
|
"background-position", "background", "word-spacing", "letter-spacing", "text-decoration", "vertical-align", |
|
"text-transform", "text-align", "text-indent", "line-height", "margin-top", "margin-right", |
|
"margin-bottom", "margin-left", "margin", "padding-top", "padding-right", "padding-bottom", "padding-left", |
|
"padding", "border-top-width", "border-right-width", "border-bottom-width", "border-left-width", |
|
"border-width", "border-color", "border-style", "border-top", "border-right", "border-bottom", |
|
"border-left", "border", "width", "height", "float", "float", "display", "white-space", "list-style-type", |
|
"list-style-image", "list-style-position", "list-style" })); |
|
|
|
private static final Object VALID_MARKER = new Object(); |
|
|
|
// public static final String DOC_TYPE = "\"-//W3C//DTD XHTML 1.0 Transitional//EN\" \""+Moumi.getDtdUri()+"\""; |
|
public static final String DOC_TYPE = "\"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://localhost:7001/dtd/loose.dtd\""; |
|
private net.htmlparser.jericho.Source source = null; |
|
private OutputDocument outputDocument = null; |
|
|
|
public Html(InputStream in) { |
|
this(new InputStreamReader(in)); |
|
} |
|
|
|
public Html(String in) { |
|
this(new StringReader(in)); |
|
} |
|
|
|
public Html(Reader reader) { |
|
super(); |
|
try { |
|
setTidyMark(false); |
|
setDocType(DOC_TYPE); |
|
setDropEmptyParas(true); |
|
setDropProprietaryAttributes(true); |
|
setHideComments(true); |
|
source = new net.htmlparser.jericho.Source(reader); |
|
source.fullSequentialParse(); |
|
outputDocument = new OutputDocument(source); |
|
sanitise(false, true); |
|
} catch (IOException e) { |
|
// TODO Auto-generated catch block |
|
e.printStackTrace(); |
|
} |
|
} |
|
|
|
/* |
|
* override setDocType, dtd까지 auto로 가능하도록. |
|
*/ |
|
|
|
public Document parseDOM() { |
|
StringWriter out = new StringWriter(); |
|
Document doc = parseDOM(new StringReader(outputDocument.toString()), out); |
|
out.getBuffer().toString(); |
|
return doc; |
|
} |
|
|
|
public void encodeInvalidMarkup() { |
|
encodeInvalidMarkup(false); |
|
} |
|
|
|
public void encodeInvalidMarkup(boolean formatWhiteSpace) { |
|
sanitise(formatWhiteSpace, false); |
|
} |
|
|
|
public void stripInvalidMarkup() { |
|
stripInvalidMarkup(false); |
|
} |
|
|
|
public void stripInvalidMarkup(boolean formatWhiteSpace) { |
|
sanitise(formatWhiteSpace, true); |
|
} |
|
|
|
// TODO: implement |
|
/* |
|
* public void toPDF(OutputStream out) { toPDF(out, defaultStyleSheet); } |
|
* |
|
* public void toPDF(OutputStream out, File styleSheet) { out = new BufferedOutputStream(out); try { Fop fop = |
|
* fopFactory.newFop(MimeConstants.MIME_PDF, foUserAgent, out); TransformerFactory factory = |
|
* TransformerFactory.newInstance(); Transformer transformer = factory.newTransformer(new StreamSource(styleSheet)); |
|
* transformer.setParameter("versionParam", "2.0"); LOG.debug("EXPORTING FROM:\n{}", domBuf); Source src = new |
|
* StreamSource(new StringReader(domBuf)); // Result res = new SAXResult(fop.getDefaultHandler()); Result res = new |
|
* StreamResult(out); transformer.transform(src, res); out.close(); } catch (FOPException e) { e.printStackTrace(); |
|
* } catch (TransformerConfigurationException e) { e.printStackTrace(); } catch (TransformerException e) { |
|
* e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } |
|
*/ |
|
|
|
private void sanitise(boolean formatWhiteSpace, boolean stripInvalidElements) { |
|
List<Tag> tags = source.getAllTags(); |
|
int pos = 0; |
|
for (Tag tag : tags) { |
|
if (processTag(tag, outputDocument)) { |
|
tag.setUserData(VALID_MARKER); |
|
} else { |
|
if (!stripInvalidElements) |
|
continue; // element will be encoded along with surrounding text |
|
outputDocument.remove(tag); |
|
} |
|
reencodeTextSegment(source, outputDocument, pos, tag.getBegin(), formatWhiteSpace); |
|
pos = tag.getEnd(); |
|
} |
|
reencodeTextSegment(source, outputDocument, pos, source.getEnd(), formatWhiteSpace); |
|
} |
|
|
|
private static boolean processTag(Tag tag, OutputDocument outputDocument) { |
|
String elementName = tag.getName(); |
|
if (!VALID_ELEMENT_NAMES.contains(elementName)) |
|
return false; |
|
if (tag.getTagType() == StartTagType.NORMAL) { |
|
Element element = tag.getElement(); |
|
if (HTMLElements.getEndTagRequiredElementNames().contains(elementName)) { |
|
if (element.getEndTag() == null) |
|
return false; // refect start tag if its required end tag is missing |
|
} else if (HTMLElements.getEndTagOptionalElementNames().contains(elementName)) { |
|
if (elementName == HTMLElementName.LI && !isValidLITag(tag)) |
|
return false; // reject invalid LI tags |
|
if (element.getEndTag() == null) |
|
outputDocument.insert(element.getEnd(), getEndTagHTML(elementName)); // insert optional end tag if |
|
// it is missing |
|
} |
|
outputDocument.replace(tag, getStartTagHTML(element.getStartTag())); |
|
} else if (tag.getTagType() == EndTagType.NORMAL) { |
|
if (tag.getElement() == null) |
|
return false; // reject end tags that aren't associated with a start tag |
|
if (elementName == HTMLElementName.LI && !isValidLITag(tag)) |
|
return false; // reject invalid LI tags |
|
outputDocument.replace(tag, getEndTagHTML(elementName)); |
|
} else { |
|
return false; // reject abnormal tags |
|
} |
|
return true; |
|
} |
|
|
|
private static boolean isValidLITag(Tag tag) { |
|
Element parentElement = tag.getElement().getParentElement(); |
|
if (parentElement == null) |
|
return false; // ignore LI elements without a parent |
|
if (parentElement.getStartTag().getUserData() != VALID_MARKER) |
|
return false; // ignore LI elements who's parent is not valid |
|
return parentElement.getName() == HTMLElementName.UL || parentElement.getName() == HTMLElementName.OL; // only |
|
// accept |
|
// LI |
|
// tags |
|
// who's |
|
// immediate |
|
// parent |
|
// is UL |
|
// or |
|
// OL. |
|
} |
|
|
|
private static void reencodeTextSegment(net.htmlparser.jericho.Source source, OutputDocument outputDocument, |
|
int begin, int end, boolean formatWhiteSpace) { |
|
if (begin >= end) |
|
return; |
|
Segment textSegment = new Segment(source, begin, end); |
|
String decodedText = CharacterReference.decode(textSegment); |
|
String encodedText = formatWhiteSpace ? CharacterReference.encodeWithWhiteSpaceFormatting(decodedText) |
|
: CharacterReference.encode(decodedText); |
|
outputDocument.replace(textSegment, encodedText); |
|
} |
|
|
|
private static CharSequence getStartTagHTML(StartTag startTag) { |
|
// tidies and filters out non-approved attributes |
|
StringBuilder sb = new StringBuilder(); |
|
sb.append('<').append(startTag.getName()); |
|
for (Attribute attribute : startTag.getAttributes()) { |
|
if (VALID_ATTRIBUTE_NAMES.contains(attribute.getKey())) { |
|
sb.append(' ').append(attribute.getName()); |
|
if (attribute.getValue() != null) { |
|
sb.append("=\""); |
|
if ("style".equals(attribute.getKey())) { |
|
String[] properties = attribute.getValue().split(";"); |
|
for (String property : properties) { |
|
String name = property.split(":")[0].trim(); |
|
if (VALID_PROPERTY_NAMES.contains(name)) { |
|
sb.append(CharacterReference.encode(property)); |
|
sb.append("; "); |
|
} |
|
} |
|
} else { |
|
sb.append(CharacterReference.encode(attribute.getValue())); |
|
} |
|
sb.append('"'); |
|
} |
|
} |
|
} |
|
if (startTag.getElement().getEndTag() == null |
|
&& !HTMLElements.getEndTagOptionalElementNames().contains(startTag.getName())) |
|
sb.append(" /"); |
|
sb.append('>'); |
|
return sb; |
|
} |
|
|
|
private static String getEndTagHTML(String tagName) { |
|
return "</" + tagName + '>'; |
|
} |
|
|
|
}
|
|
|