knu project
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

294 lines
14 KiB

/*********************************************************************************************************
* Program name : TagFilter.java
* Description  : class for obtaining project-related information
* Author       : Kang Won-jung (강원중)
* Created      : 2004.01.06
* Modified     : 2003.11.30
**********************************************************************************************************/
package kr.co.kihyun.text.html;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import net.htmlparser.jericho.Attribute;
import net.htmlparser.jericho.CharacterReference;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.EndTagType;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.HTMLElements;
import net.htmlparser.jericho.OutputDocument;
import net.htmlparser.jericho.Segment;
import net.htmlparser.jericho.StartTag;
import net.htmlparser.jericho.StartTagType;
import net.htmlparser.jericho.Tag;
import org.w3c.dom.Document;
import org.w3c.tidy.Tidy;
/**
 * HTML clean-up helper built on Jericho and JTidy.
 * <p>
 * The constructor parses the supplied markup with Jericho, rewrites or removes tags according to the
 * element/attribute/style-property whitelists declared in this class, and keeps the result in an
 * {@code OutputDocument}. Because the class extends {@link Tidy}, the filtered markup can then be turned
 * into a DOM with {@link #parseDOM()}.
 */
public class Html extends Tidy {
/**
 * Serialization version id. NOTE(review): presumably present because the Tidy superclass is
 * serializable — confirm.
 */
private static final long serialVersionUID = 1L;
// Presentation constants. None of them is referenced inside this class; presumably they are read by
// rendering code elsewhere — verify against callers.
// NOTE(review): the value "hh" does not mirror `height` below — confirm it is intended.
public static final String width = "hh";
public static final String height = "height";
public static final String tdWidth = "";
public static final String tdHeight = "25";
// 50. Declaration error caused by "public static" (CWE-500): deleted by YOUNGJUN,CHO
//public static String tdOption = " ";
//------------------------------------------------
// public static String titleBGColor = "#E0E0E0";
public static final String titleBGColor = "#f3f6fc";
public static final String inputBGColor = "#FFFFFF";
public static final String titleFontColor = "#404040";
public static final String strFontColor = "#4F484F";
public static final String intFontColor = "#4F484F";
public static final String dynaTotLineFontColor = "red"; // totals row: the last line of a dynamic table
/**
 * Whitelist of element names; tags whose name is not in this set are rejected by {@code processTag}.
 * Entries are grouped alphabetically, one initial letter (or small run of letters) per line.
 */
private static final Set<String> VALID_ELEMENT_NAMES = new HashSet<String>(Arrays.asList(
        HTMLElementName.A, HTMLElementName.ABBR, HTMLElementName.ACRONYM, HTMLElementName.ADDRESS,
        HTMLElementName.APPLET, HTMLElementName.AREA,
        HTMLElementName.B, HTMLElementName.BASE, HTMLElementName.BASEFONT, HTMLElementName.BDO,
        HTMLElementName.BIG, HTMLElementName.BLOCKQUOTE, HTMLElementName.BODY, HTMLElementName.BR,
        HTMLElementName.BUTTON,
        HTMLElementName.CAPTION, HTMLElementName.CENTER, HTMLElementName.CITE, HTMLElementName.CODE,
        HTMLElementName.COL, HTMLElementName.COLGROUP,
        HTMLElementName.DD, HTMLElementName.DEL, HTMLElementName.DFN, HTMLElementName.DIR,
        HTMLElementName.DIV, HTMLElementName.DL, HTMLElementName.DT,
        HTMLElementName.EM,
        HTMLElementName.FIELDSET, HTMLElementName.FONT, HTMLElementName.FORM, HTMLElementName.FRAME,
        HTMLElementName.FRAMESET,
        HTMLElementName.H1, HTMLElementName.H2, HTMLElementName.H3, HTMLElementName.H4,
        HTMLElementName.H5, HTMLElementName.H6, HTMLElementName.HEAD, HTMLElementName.HR,
        HTMLElementName.HTML,
        HTMLElementName.I, HTMLElementName.IFRAME, HTMLElementName.IMG, HTMLElementName.INPUT,
        HTMLElementName.INS, HTMLElementName.ISINDEX,
        HTMLElementName.KBD,
        HTMLElementName.LABEL, HTMLElementName.LEGEND, HTMLElementName.LI, HTMLElementName.LINK,
        HTMLElementName.MAP, HTMLElementName.MENU, HTMLElementName.META,
        HTMLElementName.NOFRAMES, HTMLElementName.NOSCRIPT,
        HTMLElementName.OBJECT, HTMLElementName.OL, HTMLElementName.OPTGROUP, HTMLElementName.OPTION,
        HTMLElementName.P, HTMLElementName.PARAM, HTMLElementName.PRE,
        HTMLElementName.Q,
        HTMLElementName.S, HTMLElementName.SAMP, HTMLElementName.SCRIPT, HTMLElementName.SELECT,
        HTMLElementName.SMALL, HTMLElementName.SPAN, HTMLElementName.STRIKE, HTMLElementName.STRONG,
        HTMLElementName.STYLE, HTMLElementName.SUB, HTMLElementName.SUP,
        HTMLElementName.TABLE, HTMLElementName.TBODY, HTMLElementName.TD, HTMLElementName.TEXTAREA,
        HTMLElementName.TFOOT, HTMLElementName.TH, HTMLElementName.THEAD, HTMLElementName.TITLE,
        HTMLElementName.TR, HTMLElementName.TT,
        HTMLElementName.U, HTMLElementName.UL,
        HTMLElementName.VAR));
/**
 * Whitelist of attribute keys; {@code getStartTagHTML} copies an attribute to the output only when its
 * key is in this set.
 * <p>
 * The double-click handler is spelled {@code ondblclick}, as in HTML 4.01. The list previously held the
 * misspelling "ondbclick", which caused real {@code ondblclick} attributes to be dropped.
 */
private static final Set<String> VALID_ATTRIBUTE_NAMES = new HashSet<String>(Arrays.asList(
        "id", "class", "hreflang", "title", "name", "rel", "rev", "type", "defer", "language", "xml:space",
        "style", "dir", "lang", "xml:lang", "xmlns", "ismap", "charset", "coords", "target", "href",
        "nohref", "size", "color", "face", "cite", "link", "alink", "vlink", "media", "shape", "height",
        "width", "align", "valign", "alt", "start", "label", "multiple", "archive", "code", "codebase",
        "codetype", "classid", "data", "declare", "standby", "hspace", "object", "vspace", "background",
        "bgcolor", "text", "disabled", "value", "valuetype", "char", "charoff", "span", "datetime",
        "compact", "profile", "noshade", "action", "accept", "accept-charset", "enctype", "method",
        "border", "usemap", "frameborder", "longdesc", "marginheight", "marginwidth", "scrolling", "src",
        "checked", "maxlength", "readonly", "prompt", "for", "content", "http-equiv", "scheme",
        "cellpadding", "cellspacing", "frame", "summary", "abbr", "axis", "colspan", "headers", "nowrap",
        "rowspan", "scope", "cols", "rows", "accesskey", "tabindex",
        // intrinsic event handlers
        "onload", "onunload", "onblur", "onchange", "onfocus", "onreset", "onselect", "onsubmit",
        "onabort", "onkeydown", "onkeypress", "onkeyup", "onclick", "ondblclick", "onmousedown",
        "onmousemove", "onmouseout", "onmouseover", "onmouseup"));
/**
 * Whitelist of CSS property names; {@code getStartTagHTML} keeps a declaration inside a {@code style}
 * attribute only when its property name is in this set.
 * <p>
 * The entries follow the CSS1 property list. "font-style" and "clear" take the places where the list
 * previously repeated "font-family" and "float", which had left those two properties always filtered out.
 */
private static final Set<String> VALID_PROPERTY_NAMES = new HashSet<String>(Arrays.asList(
        // font
        "font-family", "font-style", "font-variant", "font-weight", "font-size", "font",
        // colour and background
        "color", "background-color", "background-image", "background-repeat", "background-attachment",
        "background-position", "background",
        // text
        "word-spacing", "letter-spacing", "text-decoration", "vertical-align", "text-transform",
        "text-align", "text-indent", "line-height",
        // box: margin, padding, border
        "margin-top", "margin-right", "margin-bottom", "margin-left", "margin",
        "padding-top", "padding-right", "padding-bottom", "padding-left", "padding",
        "border-top-width", "border-right-width", "border-bottom-width", "border-left-width",
        "border-width", "border-color", "border-style", "border-top", "border-right", "border-bottom",
        "border-left", "border",
        // box: size and flow
        "width", "height", "float", "clear",
        // classification
        "display", "white-space", "list-style-type", "list-style-image", "list-style-position",
        "list-style"));
// Sentinel stored as user data on tags that passed validation; isValidLITag compares against it by identity.
private static final Object VALID_MARKER = new Object();
// public static final String DOC_TYPE = "\"-//W3C//DTD XHTML 1.0 Transitional//EN\" \""+Moumi.getDtdUri()+"\"";
// Doctype string handed to setDocType(...) in the Reader constructor.
// NOTE(review): the DTD URI is hard-coded to localhost:7001 — confirm it is valid for every deployment.
public static final String DOC_TYPE = "\"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://localhost:7001/dtd/loose.dtd\"";
// Jericho parse of the input markup; assigned in the Reader constructor.
private net.htmlparser.jericho.Source source = null;
// Edits (tag rewrites, removals, re-encoded text) recorded against `source`; assigned in the Reader constructor.
private OutputDocument outputDocument = null;
/**
 * Builds an instance from a byte stream by wrapping it in an {@link InputStreamReader} and delegating
 * to {@link #Html(Reader)}.
 * <p>
 * NOTE(review): no charset is given to the reader, so the bytes are decoded with the JVM default
 * charset — confirm that matches the encoding of the HTML being supplied.
 *
 * @param in stream containing the HTML markup
 */
public Html(InputStream in) {
this(new InputStreamReader(in));
}
/**
 * Builds an instance from markup held in memory by wrapping it in a {@link StringReader} and
 * delegating to {@link #Html(Reader)}.
 *
 * @param in the HTML markup itself (not a file name or URL)
 */
public Html(String in) {
this(new StringReader(in));
}
/**
 * Configures the inherited Tidy options, parses the markup from {@code reader} with Jericho and runs
 * an initial sanitising pass that strips invalid tags (no whitespace formatting).
 * <p>
 * A read failure is reported immediately. It used to be printed and swallowed, which left
 * {@code source} and {@code outputDocument} null and made every later call fail with a
 * {@code NullPointerException}.
 *
 * @param reader supplies the HTML markup
 * @throws IllegalStateException if the markup cannot be read from {@code reader}; the original
 *                               {@link IOException} is attached as the cause
 */
public Html(Reader reader) {
    super();
    setTidyMark(false);
    setDocType(DOC_TYPE);
    setDropEmptyParas(true);
    setDropProprietaryAttributes(true);
    setHideComments(true);
    try {
        source = new net.htmlparser.jericho.Source(reader);
    } catch (IOException e) {
        throw new IllegalStateException("Unable to read HTML input", e);
    }
    source.fullSequentialParse();
    outputDocument = new OutputDocument(source);
    sanitise(false, true);
}
/**
 * Parses the sanitised markup (the current content of {@code outputDocument}) into a DOM using the
 * inherited {@code parseDOM(Reader, Writer)}.
 * <p>
 * Whatever Tidy writes to the writer argument is discarded; only the DOM is returned.
 * <p>
 * NOTE: an earlier comment at this spot spoke of overriding {@code setDocType} so that the DTD could
 * also be picked automatically; no such override exists in this class.
 *
 * @return the document produced by Tidy
 */
public Document parseDOM() {
    // Throw-away sink for Tidy's textual output.
    StringWriter discarded = new StringWriter();
    return parseDOM(new StringReader(outputDocument.toString()), discarded);
}
/**
 * Runs the encode-mode sanitising pass without whitespace formatting; shorthand for
 * {@code encodeInvalidMarkup(false)}.
 */
public void encodeInvalidMarkup() {
    final boolean formatWhiteSpace = false;
    encodeInvalidMarkup(formatWhiteSpace);
}
/**
 * Runs a sanitising pass in which tags that fail validation are not removed; they are left in place
 * so that they get character-encoded together with the surrounding text.
 *
 * @param formatWhiteSpace whether text is re-encoded with whitespace formatting
 */
public void encodeInvalidMarkup(boolean formatWhiteSpace) {
    final boolean stripInvalidElements = false;
    sanitise(formatWhiteSpace, stripInvalidElements);
}
/**
 * Runs the strip-mode sanitising pass without whitespace formatting; shorthand for
 * {@code stripInvalidMarkup(false)}.
 */
public void stripInvalidMarkup() {
    final boolean formatWhiteSpace = false;
    stripInvalidMarkup(formatWhiteSpace);
}
/**
 * Runs a sanitising pass in which tags that fail validation are removed from the output.
 *
 * @param formatWhiteSpace whether text is re-encoded with whitespace formatting
 */
public void stripInvalidMarkup(boolean formatWhiteSpace) {
    final boolean stripInvalidElements = true;
    sanitise(formatWhiteSpace, stripInvalidElements);
}
// TODO: implement
/*
* public void toPDF(OutputStream out) { toPDF(out, defaultStyleSheet); }
*
* public void toPDF(OutputStream out, File styleSheet) { out = new BufferedOutputStream(out); try { Fop fop =
* fopFactory.newFop(MimeConstants.MIME_PDF, foUserAgent, out); TransformerFactory factory =
* TransformerFactory.newInstance(); Transformer transformer = factory.newTransformer(new StreamSource(styleSheet));
* transformer.setParameter("versionParam", "2.0"); LOG.debug("EXPORTING FROM:\n{}", domBuf); Source src = new
* StreamSource(new StringReader(domBuf)); // Result res = new SAXResult(fop.getDefaultHandler()); Result res = new
* StreamResult(out); transformer.transform(src, res); out.close(); } catch (FOPException e) { e.printStackTrace();
* } catch (TransformerConfigurationException e) { e.printStackTrace(); } catch (TransformerException e) {
* e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } }
*/
/**
 * Walks every tag of the parsed source, rewriting valid tags, optionally removing invalid ones, and
 * re-encoding the text that lies between the tags that were handled.
 * <p>
 * Each call starts from a fresh {@code OutputDocument}. The constructor already runs one pass, so
 * without the reset a later call through the public encode/strip methods would record its
 * replacements and inserted end tags on top of those from the earlier pass.
 *
 * @param formatWhiteSpace     whether text is re-encoded with whitespace formatting
 * @param stripInvalidElements {@code true} to remove tags that fail validation; {@code false} to leave
 *                             them in place so they are encoded as part of the surrounding text
 */
private void sanitise(boolean formatWhiteSpace, boolean stripInvalidElements) {
    outputDocument = new OutputDocument(source);
    int pos = 0; // end of the last tag that was handled; text before it is already re-encoded
    for (Tag tag : source.getAllTags()) {
        if (processTag(tag, outputDocument)) {
            // remembered so that child LI tags can check that their parent was accepted
            tag.setUserData(VALID_MARKER);
        } else if (stripInvalidElements) {
            outputDocument.remove(tag);
        } else {
            // Do not advance pos: the tag stays inside the next text segment and is encoded with it.
            continue;
        }
        reencodeTextSegment(source, outputDocument, pos, tag.getBegin(), formatWhiteSpace);
        pos = tag.getEnd();
    }
    // trailing text after the last handled tag
    reencodeTextSegment(source, outputDocument, pos, source.getEnd(), formatWhiteSpace);
}
/**
 * Validates a single tag and, when it is acceptable, records its tidied replacement in
 * {@code outputDocument}.
 * <p>
 * A tag is rejected when its name is not whitelisted, when it is neither a normal start tag nor a
 * normal end tag, when a start tag lacks a required end tag, when an end tag has no matching start
 * tag, or when it is an LI tag that fails {@code isValidLITag}. A missing optional end tag is inserted
 * at the end of the element.
 *
 * @param tag            the tag to examine
 * @param outputDocument receives the replacement (and any inserted end tag)
 * @return {@code true} if the tag was accepted and rewritten, {@code false} if it was rejected
 */
private static boolean processTag(Tag tag, OutputDocument outputDocument) {
    String elementName = tag.getName();
    if (!VALID_ELEMENT_NAMES.contains(elementName)) {
        return false;
    }
    // Names are compared with equals() rather than ==, so the result does not depend on the parser
    // handing back the very same String instance as the HTMLElementName constant.
    boolean isListItem = HTMLElementName.LI.equals(elementName);

    if (tag.getTagType() == StartTagType.NORMAL) {
        Element element = tag.getElement();
        if (HTMLElements.getEndTagRequiredElementNames().contains(elementName)) {
            if (element.getEndTag() == null) {
                return false; // reject start tag if its required end tag is missing
            }
        } else if (HTMLElements.getEndTagOptionalElementNames().contains(elementName)) {
            if (isListItem && !isValidLITag(tag)) {
                return false; // reject invalid LI tags
            }
            if (element.getEndTag() == null) {
                // insert optional end tag if it is missing
                outputDocument.insert(element.getEnd(), getEndTagHTML(elementName));
            }
        }
        outputDocument.replace(tag, getStartTagHTML(element.getStartTag()));
        return true;
    }

    if (tag.getTagType() == EndTagType.NORMAL) {
        if (tag.getElement() == null) {
            return false; // reject end tags that aren't associated with a start tag
        }
        if (isListItem && !isValidLITag(tag)) {
            return false; // reject invalid LI tags
        }
        outputDocument.replace(tag, getEndTagHTML(elementName));
        return true;
    }

    return false; // reject abnormal tags
}
/**
 * Decides whether an LI tag may be kept: its element must have a parent, that parent's start tag must
 * already carry {@code VALID_MARKER}, and the parent must be a UL or OL element.
 *
 * @param tag a start or end tag of an LI element; {@code tag.getElement()} must not be null
 * @return {@code true} only when the immediate parent is an accepted UL or OL
 */
private static boolean isValidLITag(Tag tag) {
    Element parentElement = tag.getElement().getParentElement();
    if (parentElement == null) {
        return false; // ignore LI elements without a parent
    }
    // Identity comparison is intended here: VALID_MARKER is a unique sentinel object.
    if (parentElement.getStartTag().getUserData() != VALID_MARKER) {
        return false; // ignore LI elements whose parent is not valid
    }
    // Only accept LI tags whose immediate parent is UL or OL. Names are compared with equals()
    // rather than == so the check does not rely on String identity.
    String parentName = parentElement.getName();
    return HTMLElementName.UL.equals(parentName) || HTMLElementName.OL.equals(parentName);
}
/**
 * Decodes the character references in the source range {@code [begin, end)} and registers the
 * freshly encoded text as its replacement in {@code outputDocument}. An empty or inverted range is
 * ignored.
 *
 * @param source           the parsed source the range refers to
 * @param outputDocument   receives the replacement
 * @param begin            start position of the text (inclusive)
 * @param end              end position of the text (exclusive)
 * @param formatWhiteSpace {@code true} to encode with whitespace formatting, {@code false} for plain
 *                         encoding
 */
private static void reencodeTextSegment(net.htmlparser.jericho.Source source, OutputDocument outputDocument,
        int begin, int end, boolean formatWhiteSpace) {
    if (begin < end) {
        Segment span = new Segment(source, begin, end);
        String plain = CharacterReference.decode(span);
        String escaped;
        if (formatWhiteSpace) {
            escaped = CharacterReference.encodeWithWhiteSpaceFormatting(plain);
        } else {
            escaped = CharacterReference.encode(plain);
        }
        outputDocument.replace(span, escaped);
    }
}
/**
 * Builds the tidied text of a start tag: the tag name followed by the whitelisted attributes only,
 * each with its value re-encoded and double-quoted. The value of a {@code style} attribute is
 * additionally filtered declaration by declaration. {@code " /"} is appended before the closing
 * {@code >} when the element has no end tag and its end tag is not optional.
 *
 * @param startTag the start tag to rewrite
 * @return the replacement markup for {@code startTag}
 */
private static CharSequence getStartTagHTML(StartTag startTag) {
    StringBuilder sb = new StringBuilder();
    sb.append('<').append(startTag.getName());
    for (Attribute attribute : startTag.getAttributes()) {
        if (!VALID_ATTRIBUTE_NAMES.contains(attribute.getKey())) {
            continue; // non-approved attribute
        }
        sb.append(' ').append(attribute.getName());
        String value = attribute.getValue();
        if (value == null) {
            continue; // attribute without a value: emit the bare name only
        }
        sb.append("=\"");
        if ("style".equals(attribute.getKey())) {
            appendFilteredStyle(sb, value);
        } else {
            sb.append(CharacterReference.encode(value));
        }
        sb.append('"');
    }
    if (startTag.getElement().getEndTag() == null
            && !HTMLElements.getEndTagOptionalElementNames().contains(startTag.getName())) {
        sb.append(" /");
    }
    sb.append('>');
    return sb;
}

/**
 * Appends to {@code sb} every {@code ;}-separated declaration of {@code style} whose property name is
 * whitelisted, each encoded and followed by {@code "; "}.
 * <p>
 * The property name is the text before the first {@code ':'} (or the whole declaration if there is
 * none), trimmed and lower-cased, so names such as {@code FONT-SIZE} are recognised. It is located
 * with {@code indexOf}, because {@code ":".split(":")} yields an empty array and indexing it would
 * throw.
 */
private static void appendFilteredStyle(StringBuilder sb, String style) {
    for (String declaration : style.split(";")) {
        int colon = declaration.indexOf(':');
        String rawName = colon < 0 ? declaration : declaration.substring(0, colon);
        String name = rawName.trim().toLowerCase(java.util.Locale.ENGLISH);
        if (VALID_PROPERTY_NAMES.contains(name)) {
            sb.append(CharacterReference.encode(declaration));
            sb.append("; ");
        }
    }
}
/**
 * Formats an end tag for the given element name.
 *
 * @param tagName element name to close
 * @return {@code "</"} followed by {@code tagName} and {@code ">"}
 */
private static String getEndTagHTML(String tagName) {
    StringBuilder endTag = new StringBuilder("</");
    endTag.append(tagName);
    endTag.append('>');
    return endTag.toString();
}
}