org.htmlparser.Parser.visitAllNodesWith()方法的使用及代码示例

x33g5p2x  于2022-01-26 转载在 其他  
字(8.0k)|赞(0)|评价(0)|浏览(102)

本文整理了Java中org.htmlparser.Parser.visitAllNodesWith()方法的一些代码示例,展示了Parser.visitAllNodesWith()的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。Parser.visitAllNodesWith()方法的具体详情如下:
包路径:org.htmlparser.Parser
类名称:Parser
方法名:visitAllNodesWith

Parser.visitAllNodesWith介绍

[英]Apply the given visitor to the current page. The visitor is passed to the accept() method of each node in the page in a depth first traversal. The visitor's beginParsing() method is called prior to processing the page and finishedParsing() is called after the processing.
[中]将给定的访问者应用到当前页面。在深度优先遍历中,访问者被传递到页面中每个节点的accept()方法。在处理页面之前调用访问者的beginParsing()方法,在处理之后调用finishedParsing()方法。

代码示例

代码示例来源:origin: com.bbossgroups/bboss-htmlparser

/**
 * Runs this object as the visitor over the parser's page and returns
 * whatever text ended up in the internal buffer.
 * The buffer is replaced with a fresh, empty one before returning.
 * @return The buffer contents gathered during the traversal.
 * @exception ParserException If the traversal reports a parse error.
 */
protected String extractStrings () throws ParserException
{
  mParser.visitAllNodesWith (this);
  // Snapshot the collected text before swapping in a new buffer.
  final String collected = mBuffer.toString ();
  mBuffer = new StringBuilder(4096);
  return collected;
}

代码示例来源:origin: org.htmlparser/htmlparser

/**
 * Resets the collapse state, runs this object as the visitor over the
 * parser's page and returns whatever text ended up in the internal buffer.
 * The buffer is replaced with a fresh, empty one before returning.
 * @return The buffer contents gathered during the traversal.
 * @exception ParserException If the traversal reports a parse error.
 */
protected String extractStrings () throws ParserException
{
  mCollapseState = 0;
  mParser.visitAllNodesWith (this);
  // Snapshot the collected text before swapping in a new buffer.
  final String collected = mBuffer.toString ();
  mBuffer = new StringBuffer(4096);
  return collected;
}

代码示例来源:origin: fhopf/akka-crawler-example

/**
 * Parses the page at the given URL with a {@code PageContentVisitor}
 * and returns the content that visitor produced.
 *
 * @param url the address handed to the parser and to the visitor
 * @return the content reported by the visitor after the traversal
 * @throws IllegalStateException wrapping any {@code ParserException} raised while parsing
 */
@Override
public PageContent fetchPageContent(String url) {
  logger.debug("Fetching {}", url);
  try {
    final Parser pageParser = new Parser(url);
    final PageContentVisitor collector = new PageContentVisitor(baseUrl, url);
    pageParser.visitAllNodesWith(collector);
    return collector.getContent();
  } catch (ParserException parseFailure) {
    // Surface the checked parser failure as an unchecked one, keeping the cause.
    throw new IllegalStateException(parseFailure);
  }
}

代码示例来源:origin: omegat-org/omegat

/**
 * Reads the whole input into memory, then parses it as HTML and walks
 * every node with an {@code HHCFilterVisitor} bound to the output writer.
 *
 * @param infile  reader supplying the source text
 * @param outfile writer passed to the visitor
 * @param fc      filter context (not used by the statements in this method)
 * @throws IOException if reading fails, or if the input does not fit in memory
 * @throws TranslationException declared by the signature; not thrown directly here
 */
@Override
public void processFile(BufferedReader infile, BufferedWriter outfile, FilterContext fc) throws IOException,
    TranslationException {
  StringBuilder content = null;
  try {
    content = new StringBuilder();
    final char[] chunk = new char[1000];
    // Pull the input in fixed-size chunks until the reader stops delivering characters.
    for (int count = infile.read(chunk); count > 0; count = infile.read(chunk)) {
      content.append(chunk, 0, count);
    }
  } catch (OutOfMemoryError e) {
    // Drop the partially filled buffer and ask for a collection before reporting the failure.
    content = null;
    System.gc();
    throw new IOException(OStrings.getString("HHC__FILE_TOO_BIG"));
  }

  final Parser htmlParser = new Parser();
  try {
    htmlParser.setInputHTML(content.toString());
    final HHCFilterVisitor visitor = new HHCFilterVisitor(this, outfile);
    htmlParser.visitAllNodesWith(visitor);
  } catch (ParserException pe) {
    // NOTE(review): a parse failure is only printed to standard output; processing ends silently.
    System.out.println(pe);
  }
}

代码示例来源:origin: oaqa/knn4qa

/**
 * Builds the cleaned text for a post by walking its HTML with a
 * {@code PostCleanerVisitor}; if the parser fails, falls back to
 * {@code PostCleanerVisitor.simpleProc}.
 *
 * @param html         the HTML to clean
 * @param minCodeChars forwarded to the visitor
 * @param excludeCode  forwarded to the visitor
 */
public PostCleaner(String html, int minCodeChars, boolean excludeCode) {
  String cleaned;
  try {
    final Parser parser = Parser.createParser(html, "utf8");
    final PostCleanerVisitor visitor = new PostCleanerVisitor(minCodeChars, excludeCode);
    parser.visitAllNodesWith(visitor);
    cleaned = visitor.getText();
  } catch (ParserException e) {
    System.err.println(" Parser exception: " + e + " trying simple conversion");
    // Fallback path: use the visitor's simple conversion instead of the parser.
    cleaned = PostCleanerVisitor.simpleProc(html);
  }
  mText = cleaned;
}

代码示例来源:origin: org.htmlparser/htmlparser

// NOTE(review): excerpt only; the surrounding methods and control flow are not shown.
// Apply this object as the visitor to every node of the page, then publish the buffered text.
mParser.visitAllNodesWith (this);
updateStrings (mBuffer.toString ());
// Start from an empty buffer and a reset collapse state before traversing again.
mBuffer = new StringBuffer (4096);
mCollapseState = 0;
mParser.visitAllNodesWith (this);
updateStrings (mBuffer.toString ());

代码示例来源:origin: org.exoplatform.core/exo.core.component.document

// Apply the visitor sb to every node of the parser's current page.
parser.visitAllNodesWith(sb);

代码示例来源:origin: com.bbossgroups/bboss-htmlparser

// NOTE(review): excerpt only; the surrounding methods and control flow are not shown.
// Apply this object as the visitor to every node of the page, then publish the buffered text.
mParser.visitAllNodesWith (this);
updateStrings (mBuffer.toString ());
// Same traversal-then-publish sequence, repeated.
mParser.visitAllNodesWith (this);
updateStrings (mBuffer.toString ());

代码示例来源:origin: com.bbossgroups.pdp/pdp-cms

// Apply this object as the visitor to every node of the parser's current page.
parser.visitAllNodesWith(this);

代码示例来源:origin: org.opencms/opencms-core

/**
 * Parses the given HTML with this object acting as the node visitor and
 * returns the accumulated result.
 *
 * @param html the HTML text to process
 * @param encoding the character set used to initialize the page
 * @return the value of {@code getResult()} after the traversal
 * @throws ParserException if the traversal reports a parse error
 *
 * @see org.opencms.util.I_CmsHtmlNodeVisitor#process(java.lang.String, java.lang.String)
 */
public String process(String html, String encoding) throws ParserException {
  m_result = new StringBuffer();

  final Parser parser = new Parser();
  final Lexer lexer = new Lexer();
  // the page carries the text together with the requested char set
  lexer.setPage(new Page(html, encoding));
  parser.setLexer(lexer);

  final boolean hasNoAutoCloseTags = (m_noAutoCloseTags != null) && (m_noAutoCloseTags.size() > 0);
  if (hasNoAutoCloseTags) {
    // Composite tags that would own children in the DOM tree are downgraded to plain
    // single tags, so a tag may end while HTML tags are still open without the
    // html parser generating the closing tags itself.
    lexer.setNodeFactory(configureNoAutoCorrectionTags());
  }

  // walk the page with this instance as the visitor, then hand back what was collected
  parser.visitAllNodesWith(this);
  return getResult();
}

代码示例来源:origin: org.opencms/opencms-solr

/**
 * Runs this visitor over the supplied HTML and returns what it collected.
 *
 * @param html the HTML text to process
 * @param encoding the character set used to initialize the page
 * @return the value of {@code getResult()} after the traversal
 * @throws ParserException if the traversal reports a parse error
 *
 * @see org.opencms.util.I_CmsHtmlNodeVisitor#process(java.lang.String, java.lang.String)
 */
public String process(String html, String encoding) throws ParserException {
  m_result = new StringBuffer();

  Parser htmlParser = new Parser();
  Lexer htmlLexer = new Lexer();
  // build the page from the text and the requested char set
  Page source = new Page(html, encoding);
  htmlLexer.setPage(source);
  htmlParser.setLexer(htmlLexer);

  boolean degradeTags = m_noAutoCloseTags != null && m_noAutoCloseTags.size() > 0;
  if (degradeTags) {
    // Treat composite tags (those with children in the DOM tree) as simple single tags;
    // this lets such a tag finish while HTML tags are still open, without the
    // html parser generating the closing tags.
    PrototypicalNodeFactory nodeFactory = configureNoAutoCorrectionTags();
    htmlLexer.setNodeFactory(nodeFactory);
  }

  // traverse the page using this instance as the visitor
  htmlParser.visitAllNodesWith(this);
  return getResult();
}

代码示例来源:origin: dbiir/rainbow

// Build an HtmlPage from the parser; the same object is then used as the visitor.
HtmlPage page = new HtmlPage(parser);
try {
  // Apply the HtmlPage visitor to every node of the parser's current page.
  parser.visitAllNodesWith(page);
} catch (ParserException e) {
  // Log the parse failure (the remainder of this handler is not shown in the excerpt).
  log.error("visit page error:", e);

代码示例来源:origin: org.opencms/org.opencms.workplace.tools.content

// Attach the lexer to the parser, then apply this object as the visitor to every node.
parser.setLexer(lexer);
parser.visitAllNodesWith(this);

代码示例来源:origin: org.opencms/opencms-core

/**
 * Returns the plain text of an HTML document read from a stream.<p>
 *
 * The stream is wrapped in a page with the given encoding, and a
 * <code>StringBean</code> is used as the visitor that collects the text.<p>
 *
 * @param in the html content input stream
 * @param encoding the encoding of the content
 *
 * @return the extracted text, or an empty String if the bean reports <code>null</code>
 * @throws ParserException if the parsing of the HTML failed
 * @throws UnsupportedEncodingException if the given encoding is not supported
 */
public static String extractText(InputStream in, String encoding)
throws ParserException, UnsupportedEncodingException {
  final Parser htmlParser = new Parser();
  final Lexer htmlLexer = new Lexer();
  htmlLexer.setPage(new Page(in, encoding));
  htmlParser.setLexer(htmlLexer);

  final StringBean textCollector = new StringBean();
  htmlParser.visitAllNodesWith(textCollector);

  final String extracted = textCollector.getStrings();
  if (extracted == null) {
    return "";
  }
  return extracted;
}

代码示例来源:origin: com.bbossgroups.pdp/pdp-cms

// Apply this object as the visitor to every node of the parser's current page.
parser.visitAllNodesWith(this);

代码示例来源:origin: org.opencms/opencms-solr

/**
 * Collects the text content of an HTML stream.<p>
 *
 * A <code>StringBean</code> visits every node of the page built from
 * the stream, and its strings are returned.<p>
 *
 * @param in the html content input stream
 * @param encoding the encoding of the content
 *
 * @return the extracted text, or an empty String if the bean reports <code>null</code>
 * @throws ParserException if the parsing of the HTML failed
 * @throws UnsupportedEncodingException if the given encoding is not supported
 */
public static String extractText(InputStream in, String encoding)
throws ParserException, UnsupportedEncodingException {
  Parser p = new Parser();
  Lexer lex = new Lexer();
  Page content = new Page(in, encoding);
  lex.setPage(content);
  p.setLexer(lex);

  StringBean bean = new StringBean();
  p.visitAllNodesWith(bean);

  String text = bean.getStrings();
  if (text != null) {
    return text;
  }
  // normalize a missing result to the empty String
  return "";
}

代码示例来源:origin: com.bbossgroups.pdp/pdp-cms

/**
 * Extract the text from a HTML page.<p>
 *
 * The stream is wrapped in a page with the given encoding and traversed
 * with a <code>StringBean</code> visitor that collects the text.<p>
 *
 * @param in the html content input stream
 * @param encoding the encoding of the content
 *
 * @return the extracted text from the page, never <code>null</code>
 *         (an empty String is returned when the bean yields no text)
 * @throws ParserException if the parsing of the HTML failed
 * @throws UnsupportedEncodingException if the given encoding is not supported
 */
public static String extractText(InputStream in, String encoding)
throws ParserException, UnsupportedEncodingException {
  Parser parser = new Parser();
  Lexer lexer = new Lexer();
  Page page = new Page(in, encoding);
  lexer.setPage(page);
  parser.setLexer(lexer);
  StringBean stringBean = new StringBean();
  parser.visitAllNodesWith(stringBean);
  // Guard against a null result so callers can use the text without a null check.
  String result = stringBean.getStrings();
  return result == null ? "" : result;
}

代码示例来源:origin: fhopf/akka-crawler-example

/**
 * Parses a live page, collects its {@code LinkTag} nodes through an
 * {@code ObjectFindingVisitor}, checks that at least one was found and
 * prints each link text with its target.
 */
@Test
  public void testLinkExtraction() throws ParserException {
    Parser pageParser = new Parser("http://synyx.de");
    ObjectFindingVisitor linkFinder = new ObjectFindingVisitor(LinkTag.class);
    pageParser.visitAllNodesWith(linkFinder);

    Node[] found = linkFinder.getTags();
    // TODO this could use some more meaningful assertions
    assertTrue(found.length > 0);

    for (Node node : found) {
      LinkTag link = (LinkTag) node;
      System.out.print("\"" + link.getLinkText() + "\" => ");
      System.out.println(link.getLink());
    }
  }
}

代码示例来源:origin: org.opencms/opencms-core

// Apply the given visitor to every node of the parser's current page.
parser.visitAllNodesWith(visitor);

代码示例来源:origin: org.opencms/opencms-solr

// Apply the given visitor to every node of the parser's current page.
parser.visitAllNodesWith(visitor);

相关文章