/**
 * Parses an HTML input stream into a {@code Document}.
 *
 * <p>The HTML is parsed by a Xerces {@code DOMParser} driven by a NekoHTML
 * {@code HTMLConfiguration} whose {@code names/elems} property is set to
 * {@code "lower"}. If the resulting W3C DOM has a DOCTYPE node it is removed,
 * and the DOM is then converted with a {@code DOMReader}.
 *
 * <p>This method does not explicitly close {@code stream}.
 *
 * @param stream  the HTML content to parse; {@code null} yields {@code null}
 * @param charset name of the charset used to decode {@code stream}; when
 *                {@code StringUtils.isEmpty(charset)} is true,
 *                {@code DEFAULT_CHARSET} is used instead
 * @return the converted document, or {@code null} if {@code stream} is
 *         {@code null} or parsing fails with a {@code SAXException} or
 *         {@code IOException}
 */
public static Document transferByNeko(InputStream stream, String charset) {
    if (stream == null) {
        return null;
    }
    if (StringUtils.isEmpty(charset)) {
        charset = DEFAULT_CHARSET;
    }

    // Original author's note: NekoHTML's own DOMParser converts HTML tag names to
    // upper case whether or not the names/elems property is set, so the Xerces
    // DOMParser is used here with a NekoHTML HTMLConfiguration instead.
    Document doc = null;
    try {
        HTMLConfiguration htmlConfiguration = new HTMLConfiguration();
        htmlConfiguration.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
        org.apache.xerces.parsers.DOMParser parser =
                new org.apache.xerces.parsers.DOMParser(htmlConfiguration);

        InputSource inputSource =
                new InputSource(new InputStreamReader(stream, Charset.forName(charset)));
        parser.parse(inputSource);

        org.w3c.dom.Document document = parser.getDocument();
        // Drop the DOCTYPE node (if any) before handing the DOM to DOMReader.
        DocumentType documentType = document.getDoctype();
        if (documentType != null) {
            document.removeChild(documentType);
        }

        doc = new DOMReader().read(document);
    } catch (SAXException e) {
        // Best effort: the failure is printed and null is returned to the caller.
        e.printStackTrace();
    } catch (IOException e) {
        // Best effort: the failure is printed and null is returned to the caller.
        e.printStackTrace();
    }
    return doc;
}
posted on 2018-03-23 08:23 阅读( ...) 评论( ...)