org.apache.tika.parser.Parser.parse()方法的使用及代码示例

x33g5p2x  于2022-01-26 转载在 其他  
字(9.3k)|赞(0)|评价(0)|浏览(259)

本文整理了Java中org.apache.tika.parser.Parser.parse()方法的一些代码示例,展示了Parser.parse()的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,希望能在一定程度上帮助到你。Parser.parse()方法的具体详情如下:
包路径:org.apache.tika.parser.Parser
类名称:Parser
方法名:parse

Parser.parse介绍

[英]Parses a document stream into a sequence of XHTML SAX events. Fills in related document metadata in the given metadata object.

The given document stream is consumed but not closed by this method. The responsibility to close the stream remains on the caller.

Information about the parsing context can be passed in the context parameter. See the parser implementations for the kinds of context information they expect.
[中]将文档流解析为一系列XHTML SAX事件。填写给定元数据对象中的相关文档元数据。
给定的文档流会被此方法读取(消耗),但不会被此方法关闭。关闭流的责任仍在调用方身上。
有关解析上下文的信息可以在context参数中传递。请参阅解析器实现,了解它们所期望的上下文信息的种类。

代码示例

代码示例来源:origin: yasserg/crawler4j

/**
 * Parses {@code data} with the auto-detecting parser, serializes the SAX output
 * through a transformer handler, and stores the resulting text in {@code this.html}.
 *
 * <p>Any failure from handler creation, parsing, or decoding propagates to the
 * caller unchanged.
 *
 * @param data raw bytes of the binary document to convert
 */
public void setBinaryContent(byte[] data)
      throws TransformerConfigurationException, TikaException, SAXException, IOException {
  InputStream source = new ByteArrayInputStream(data);
  ByteArrayOutputStream serialized = new ByteArrayOutputStream();
  TransformerHandler saxToText =
      getTransformerHandler(serialized, DEFAULT_OUTPUT_FORMAT, DEFAULT_ENCODING);
  AUTO_DETECT_PARSER.parse(source, saxToText, new Metadata(), context);
  // Workaround (described by the original author as removing Tika's inserted
  // DocType): strip the XHTML namespace URI from the serialized output.
  String markup = new String(serialized.toByteArray(), DEFAULT_ENCODING);
  this.html = markup.replace("http://www.w3.org/1999/xhtml", "");
}

代码示例来源:origin: apache/tika

/**
 * Minimal usage example: runs an {@code HtmlParser} over an empty byte stream,
 * sending the SAX events to a no-op {@code DefaultHandler}.
 *
 * @throws Exception if the parser reports a failure
 */
public static void useHtmlParser() throws Exception {
  InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
  Parser htmlParser = new HtmlParser();
  htmlParser.parse(emptyInput, new DefaultHandler(), new Metadata(), new ParseContext());
}

代码示例来源:origin: apache/tika

/**
 * Parses one page of raw bytes with {@code htmlParser}. The events are sent to
 * {@code xhtml} wrapped in a {@code BodyContentHandler} and an
 * {@code EmbeddedContentHandler}.
 *
 * <p>A {@code SAXException} is rethrown wrapped in a {@code RuntimeException};
 * an {@code IOException} is deliberately ignored.
 *
 * @param byteObject raw bytes of the page
 * @param htmlParser parser used for the page
 * @param xhtml      downstream handler receiving the page content
 * @param context    parse context forwarded to the parser
 * @throws TikaException if the parser fails
 */
private void parsePage(byte[] byteObject, Parser htmlParser,
            ContentHandler xhtml, ParseContext context) throws TikaException {
  Metadata pageMetadata = new Metadata();
  ContentHandler wrapped = new EmbeddedContentHandler(new BodyContentHandler(xhtml));
  try {
    InputStream pageBytes = new ByteArrayInputStream(byteObject);
    htmlParser.parse(pageBytes, wrapped, pageMetadata, context);
  } catch (IOException ignored) {
    // Pushback overflow from tagsoup
  } catch (SAXException e) {
    throw new RuntimeException(e);
  }
}

代码示例来源:origin: apache/tika

/**
 * Example of fanning parser output out to two handlers at once: a
 * {@code BodyContentHandler} writing to the file named {@code filename} and a
 * {@code LinkContentHandler}. The input is an empty byte stream.
 *
 * @param filename path of the file the body content is written to
 * @throws Exception if the file cannot be opened or parsing fails
 */
public static void testTeeContentHandler(String filename) throws Exception {
  InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
  Metadata docMetadata = new Metadata();
  ParseContext parseContext = new ParseContext();
  Parser autoParser = new AutoDetectParser();
  LinkContentHandler links = new LinkContentHandler();
  File target = new File(filename);
  try (OutputStream fileOut = new FileOutputStream(target)) {
    ContentHandler bodyToFile = new BodyContentHandler(fileOut);
    ContentHandler tee = new TeeContentHandler(bodyToFile, links);
    autoParser.parse(emptyInput, tee, docMetadata, parseContext);
  }
}

代码示例来源:origin: apache/tika

/**
 * Checks that the IGNORE handler type yields a plain {@code DefaultHandler},
 * both with a write-limit argument of -1 and with a write-limit argument of 5.
 */
@Test
public void testIgnore() throws Exception {
  Parser overDefaultParser = new MockParser(OVER_DEFAULT);
  ContentHandler firstHandler =
      new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1)
          .getNewContentHandler();
  assertTrue(firstHandler instanceof DefaultHandler);
  overDefaultParser.parse(null, firstHandler, null, null);
  // Unfortunately, DefaultHandler.toString() does not return "",
  // so the check is against the default Object-style string instead.
  assertContains("org.xml.sax.helpers.DefaultHandler", firstHandler.toString());

  // Second pass: verifies that no write limit exception is thrown.
  Parser smallParser = new MockParser(100);
  ContentHandler secondHandler =
      new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, 5)
          .getNewContentHandler();
  assertTrue(secondHandler instanceof DefaultHandler);
  smallParser.parse(null, secondHandler, null, null);
  assertContains("org.xml.sax.helpers.DefaultHandler", secondHandler.toString());
}

代码示例来源:origin: apache/tika

@Test
public void testXML() throws Exception {
  Parser p = new MockParser(OVER_DEFAULT);
  p.parse(null, handler, new Metadata(), null);
  String extracted = handler.toString();
  assertContains("<head><title>This is the title", extracted);
  handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING);
  assertTrue(handler instanceof ToXMLContentHandler);
  p.parse(null, handler, null, null);

代码示例来源:origin: apache/tika

new WriteOutContentHandler(maxLength);
try {
  ParseContext context = new ParseContext();
  context.set(Parser.class, parser);
  parser.parse(
         stream, new BodyContentHandler(handler), metadata, context);
} catch (SAXException e) {
  if (!handler.isWriteLimitReached(e)) {

代码示例来源:origin: apache/tika

/**
 * Parses the given binary stream and writes the text content
 * to the write end of the pipe. Potential exceptions (including
 * the one caused if the read end is closed unexpectedly) are
 * stored before the input stream is closed and processing is stopped.
 *
 * <p>Nothing is thrown from this method: the first {@code Throwable}
 * raised by parsing, closing {@code stream}, or closing {@code writer}
 * is kept in the {@code throwable} field, and later failures are dropped.
 */
public void run() {
  // Step 1: parse. Any failure (Throwable, not just Exception) is recorded, not rethrown.
  try {
    ContentHandler handler = new BodyContentHandler(writer);
    parser.parse(stream, handler, metadata, context);
  } catch (Throwable t) {
    throwable = t;
  }
  // Step 2: always close the input stream, even if parsing failed.
  try {
    stream.close();
  } catch (Throwable t) {
    // Keep the earlier failure if there is one; it is the more informative error.
    if (throwable == null) {
      throwable = t;
    }
  }
  // Step 3: always close the writer, even if an earlier step failed.
  try {
    writer.close();
  } catch (Throwable t) {
    // Same rule as above: only record this if nothing failed before.
    if (throwable == null) {
      throwable = t;
    }
  }
}

代码示例来源:origin: apache/tika

/**
 * Parses {@code input} with {@code parser} into a {@code ToXMLContentHandler}
 * and returns the handler's string output together with {@code metadata}.
 *
 * <p>The stream is always closed before this method returns. It is managed by
 * try-with-resources so that a failure while closing is attached to the parse
 * failure as a suppressed exception instead of replacing it, and so that a
 * {@code null} input does not cause a {@code NullPointerException} on close.
 *
 * @param input    stream to parse; closed by this method
 * @param parser   parser to run
 * @param metadata metadata object passed to the parser and returned in the result
 * @param context  parse context, or {@code null} to use a fresh {@code ParseContext}
 * @return the handler's string output paired with {@code metadata}
 * @throws Exception if parsing or closing the stream fails
 */
protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata, ParseContext context) throws Exception {
  ParseContext effectiveContext = (context == null) ? new ParseContext() : context;
  try (InputStream in = input) {
    ContentHandler handler = new ToXMLContentHandler();
    parser.parse(in, handler, metadata, effectiveContext);
    return new XMLResult(handler.toString(), metadata);
  }
}

代码示例来源:origin: apache/tika

public static void process(Path path) throws Exception {
    Parser parser = new AutoDetectParser();
    Metadata metadata = new Metadata();
    // The PhoneExtractingContentHandler will examine any characters for phone numbers before passing them
    // to the underlying Handler.
    PhoneExtractingContentHandler handler = new PhoneExtractingContentHandler(new BodyContentHandler(), metadata);
    try (InputStream stream = new BufferedInputStream(Files.newInputStream(path))) {
      parser.parse(stream, handler, metadata, new ParseContext());
    }
    String[] numbers = metadata.getValues("phonenumbers");
    Collections.addAll(phoneNumbers, numbers);
  }
}

代码示例来源:origin: apache/tika

@Test
public void testHTML() throws Exception {
  Parser p = new MockParser(OVER_DEFAULT);
  p.parse(null, handler, null, null);
  String extracted = handler.toString();
  assertContains("<head><title>This is the title", extracted);
  handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING);
  assertTrue(handler instanceof ToHTMLContentHandler);
  p.parse(null, handler, null, null);
  assertContains("This is the title", os.toByteArray());
  assertContains("aaaaaaaaaa", os.toByteArray());

代码示例来源:origin: apache/tika

/**
 * Example of parsing a file through a {@code FileInputStream}: the file named
 * {@code filename} is parsed by an {@code AutoDetectParser} into a no-op
 * {@code DefaultHandler}, and the stream is closed afterwards.
 *
 * @param filename path of the file to parse
 * @throws Exception if the file cannot be opened or parsing fails
 */
public static void parseFileInputStream(String filename) throws Exception {
  Parser autoParser = new AutoDetectParser();
  ContentHandler sink = new DefaultHandler();
  Metadata fileMetadata = new Metadata();
  ParseContext parseContext = new ParseContext();
  File source = new File(filename);
  try (InputStream in = new FileInputStream(source)) {
    autoParser.parse(in, sink, fileMetadata, parseContext);
  }
}

代码示例来源:origin: apache/tika

parser.parse(
      new ByteArrayInputStream(part.bytes),
      new EmbeddedContentHandler(new BodyContentHandler(handler)),
      new Metadata(), parseContext
  );
} catch (SAXException | TikaException e) {

代码示例来源:origin: apache/tika

new WriteOutContentHandler(maxStringLength);
try {
  ParseContext context = new ParseContext();
  context.set(Parser.class, parser);
  parser.parse(
      stream, new BodyContentHandler(handler), metadata, context);
} catch (SAXException e) {
  if (!handler.isWriteLimitReached(e)) {

代码示例来源:origin: vector4wang/spring-boot-quick

/**
 * Parses the given bytes and returns the text collected in a {@code StringWriter}
 * by the handler from {@code getTextContentHandler}.
 *
 * <p>The {@code TikaInputStream} is opened in try-with-resources so it is closed
 * even when parsing fails; previously it was never closed.
 *
 * @param file raw bytes of the document to parse
 * @return the text written to the buffer during parsing
 * @throws Exception if parsing or closing the stream fails
 */
public static String handleStreamContent(byte[] file)
    throws Exception {
  Metadata md = new Metadata();
  StringWriter textBuffer = new StringWriter();
  ContentHandler handler = new TeeContentHandler(
      getTextContentHandler(textBuffer)
  );
  try (TikaInputStream input = TikaInputStream.get(file, md)) {
    parser.parse(input, handler, md, context);
  }
  return textBuffer.toString();
}

代码示例来源:origin: apache/tika

xhtml.startDocument();
ContentHandler childHandler = new EmbeddedContentHandler(
   new BodyContentHandler(xhtml));
    meta.parse(zip, new DefaultHandler(), metadata, context);
  } else if (entry.getName().endsWith(".opf")) {
    meta.parse(zip, new DefaultHandler(), metadata, context);
  } else if (entry.getName().endsWith(".htm") || 
          entry.getName().endsWith(".html") || 
        entry.getName().endsWith(".xhtml")) {
    content.parse(zip, childHandler, metadata, context);

代码示例来源:origin: apache/jackrabbit-oak

// Runs the parser returned by getParser() over `stream`, passing `handler`,
// `metadata` and a fresh ParseContext. Any exception from parsing propagates
// to whoever invokes call().
@Override
 public Void call() throws Exception {
  getParser().parse(stream, handler, metadata, new ParseContext());
  // The return type is Void, so there is no value to return.
  return null;
 }
});

代码示例来源:origin: apache/tika

/**
 * Parses {@code is} with the configured {@code parser}, registering a
 * {@code MyEmbeddedDocumentExtractor} built from {@code outputDir} in the
 * parse context.
 *
 * @param is        stream to parse; not closed by this method
 * @param outputDir directory handed to the embedded-document extractor
 * @throws SAXException  if the content handler reports a failure
 * @throws TikaException if the parser fails
 * @throws IOException   if the stream cannot be read
 */
public void extract(InputStream is, Path outputDir) throws SAXException, TikaException, IOException {
  Metadata docMetadata = new Metadata();
  ParseContext parseContext = new ParseContext();
  ContentHandler bodyHandler = new BodyContentHandler(-1);
  // Register the parser first: the extractor below is constructed with this context.
  parseContext.set(Parser.class, parser);
  EmbeddedDocumentExtractor extractor = new MyEmbeddedDocumentExtractor(outputDir, parseContext);
  parseContext.set(EmbeddedDocumentExtractor.class, extractor);
  parser.parse(is, bodyHandler, docMetadata, parseContext);
}

代码示例来源:origin: apache/tika

@Test
public void testBody() throws Exception {
  Parser p = new MockParser(OVER_DEFAULT);
  p.parse(null, handler, null, null);
  String extracted = handler.toString();
  assertNotContains("title", extracted);
  handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING);
  assertTrue(handler instanceof BodyContentHandler);
  p.parse(null, handler, null, null);
  assertNotContains("title", os.toByteArray());
  assertContains("aaaaaaaaaa", os.toByteArray());

代码示例来源:origin: apache/tika

/**
 * Example of parsing gzip-compressed content fetched from a URL: the stream
 * opened from {@code address} is wrapped in a {@code GZIPInputStream} and parsed
 * by an {@code AutoDetectParser} into a no-op {@code DefaultHandler}.
 *
 * <p>The URL stream and the gzip wrapper are declared as separate resources.
 * The {@code GZIPInputStream} constructor reads the gzip header and can throw;
 * with a single nested resource the already-opened URL stream would then never
 * be closed.
 *
 * @param address URL of the gzip-compressed document
 * @throws Exception if the URL cannot be opened, the content is not valid gzip,
 *                   or parsing fails
 */
public static void parseURLStream(String address) throws Exception {
  Parser parser = new AutoDetectParser();
  ContentHandler handler = new DefaultHandler();
  Metadata metadata = new Metadata();
  ParseContext context = new ParseContext();
  try (InputStream raw = new URL(address).openStream();
       InputStream stream = new GZIPInputStream(raw)) {
    parser.parse(stream, handler, metadata, context);
  }
}

相关文章

微信公众号

最新文章

更多