org.archive.io.warc.WARCReaderFactory.get()方法的使用及代码示例

x33g5p2x  于2022-02-03 转载在 其他  
字(8.9k)|赞(0)|评价(0)|浏览(123)

本文整理了Java中org.archive.io.warc.WARCReaderFactory.get()方法的一些代码示例,展示了WARCReaderFactory.get()的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。WARCReaderFactory.get()方法的具体详情如下:
包路径:org.archive.io.warc.WARCReaderFactory
类名称:WARCReaderFactory
方法名:get

WARCReaderFactory.get介绍

暂无

代码示例

代码示例来源:origin: internetarchive/heritrix3

/**
 * Opens {@code warc} with a WARC reader, builds an ARC writer that targets
 * {@code dir}, and hands both to the reader/writer {@code transform} overload.
 *
 * @param warc   input file; must pass {@code FileUtils.assertReadable}
 * @param dir    output directory; must pass {@code FileUtils.assertReadable}
 * @param prefix prefix forwarded to the writer settings
 * @param suffix suffix forwarded to the writer settings
 * @param force  NOTE(review): not read anywhere in this method - confirm
 *               whether it is intentionally ignored
 * @throws IOException if a readability check, opening the reader or the
 *         delegated transform fails
 * @throws java.text.ParseException declared for the delegated transform
 */
public void transform(final File warc, final File dir, final String prefix,
    final String suffix, final boolean force)
throws IOException, java.text.ParseException {
  FileUtils.assertReadable(warc);
  FileUtils.assertReadable(dir);

  final WARCReader reader = WARCReaderFactory.get(warc);

  // Single metadata line recording where the output came from.
  final String provenance = "Made from " + reader.getReaderIdentifier() + " by "
      + this.getClass().getName() + "/" + getRevision();
  final List<String> metadata = new ArrayList<String>();
  metadata.add(provenance);

  // NOTE(review): the meaning of -12 (third settings argument) is not
  // visible from here - confirm against WriterPoolSettingsData.
  final WriterPoolSettingsData settings = new WriterPoolSettingsData(
      prefix,
      suffix,
      -12,
      reader.isCompressed(),
      Arrays.asList(new File [] {dir}),
      metadata);
  final ARCWriter writer = new ARCWriter(new AtomicInteger(), settings);

  transform(reader, writer);
}

代码示例来源:origin: ViDA-NYU/ache

/**
 * Opens a WARC reader on the file that {@code filePath} points to.
 *
 * @param filePath location of the WARC file
 * @return the reader produced by {@code WARCReaderFactory.get}
 * @throws IOException if the factory cannot open the file
 */
private WARCReader openFile(Path filePath) throws IOException {
  final WARCReader reader = WARCReaderFactory.get(filePath.toFile());
  return reader;
}

代码示例来源:origin: org.netpreserve.openwayback/openwayback-core

/**
 * Opens {@code warc} through {@code WARCReaderFactory} and delegates to the
 * reader-based {@code iterator} overload.
 *
 * @param warc WARC file to read
 * @return iterator of capture search results for the given file
 * @throws IOException if the reader cannot be opened or the delegate fails
 */
public CloseableIterator<CaptureSearchResult> iterator(File warc)
    throws IOException {
  final CloseableIterator<CaptureSearchResult> results =
      iterator(WARCReaderFactory.get(warc));
  return results;
}
/**

代码示例来源:origin: lintool/warcbase

/**
 * Parses an in-memory byte array as a WARC stream and returns the record
 * obtained from the reader's {@code get()} call.
 *
 * @param bytes raw bytes
 * @return the record returned by the reader, cast to {@code WARCRecord}
 * @throws IOException if the reader cannot be created or the record cannot
 *         be read
 */
public static WARCRecord fromBytes(byte[] bytes) throws IOException {
 final BufferedInputStream in =
   new BufferedInputStream(new ByteArrayInputStream(bytes));
 // An empty identifier is passed because there is no backing file name.
 final WARCReader reader = (WARCReader) WARCReaderFactory.get("", in, false);
 final WARCRecord record = (WARCRecord) reader.get();
 return record;
}

代码示例来源:origin: iipc/openwayback

/**
 * Opens {@code warc} through {@code WARCReaderFactory} and delegates to the
 * reader-based {@code iterator} overload.
 *
 * @param warc WARC file to read
 * @return iterator of capture search results for the given file
 * @throws IOException if the reader cannot be opened or the delegate fails
 */
public CloseableIterator<CaptureSearchResult> iterator(File warc)
    throws IOException {
  final CloseableIterator<CaptureSearchResult> results =
      iterator(WARCReaderFactory.get(warc));
  return results;
}
/**

代码示例来源:origin: Smerity/cc-warc-examples

/**
 * Opens the split's file on its file system and wraps the stream in a WARC
 * reader, storing the stream, file name and reader in {@code fsin},
 * {@code arPath} and {@code ar}.
 *
 * @param inputSplit split to read; cast to {@code FileSplit}
 * @param context    task context supplying the configuration
 * @throws IOException if the file cannot be opened or the reader cannot be
 *         created
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
    throws IOException, InterruptedException {
  final FileSplit fileSplit = (FileSplit) inputSplit;
  final Configuration conf = context.getConfiguration();
  final Path warcPath = fileSplit.getPath();

  fsin = warcPath.getFileSystem(conf).open(warcPath);
  arPath = warcPath.getName();
  // The reader is identified by the bare file name, not the full path.
  ar = WARCReaderFactory.get(warcPath.getName(), fsin, true);
}

代码示例来源:origin: iipc/openwayback

/**
 * Opens a WARC reader for {@code pathOrUrl} and delegates to the
 * reader-based {@code iterator} overload.
 *
 * <p>When the argument names an existing regular file it is opened as a
 * {@code File}; otherwise the string itself is passed to the factory.
 *
 * @param pathOrUrl local path or other locator accepted by the factory
 * @return iterator of capture search results for the given input
 * @throws IOException if the reader cannot be opened or the delegate fails
 */
public CloseableIterator<CaptureSearchResult> iterator(String pathOrUrl)
    throws IOException {
  final File localFile = new File(pathOrUrl);
  if (!localFile.isFile()) {
    return iterator(WARCReaderFactory.get(pathOrUrl));
  }
  return iterator(WARCReaderFactory.get(localFile));
}
/**

代码示例来源:origin: org.netpreserve.openwayback/openwayback-core

/**
 * Opens a WARC reader for {@code pathOrUrl} and delegates to the
 * reader-based {@code iterator} overload.
 *
 * <p>When the argument names an existing regular file it is opened as a
 * {@code File}; otherwise the string itself is passed to the factory.
 *
 * @param pathOrUrl local path or other locator accepted by the factory
 * @return iterator of capture search results for the given input
 * @throws IOException if the reader cannot be opened or the delegate fails
 */
public CloseableIterator<CaptureSearchResult> iterator(String pathOrUrl)
    throws IOException {
  final File localFile = new File(pathOrUrl);
  if (!localFile.isFile()) {
    return iterator(WARCReaderFactory.get(pathOrUrl));
  }
  return iterator(WARCReaderFactory.get(localFile));
}
/**

代码示例来源:origin: org.netpreserve.commons/commons-web

/**
 * Chooses the ARC or WARC reader factory from the file name's suffix and
 * opens a reader on {@code f}.
 *
 * @param f      archive file to open
 * @param offset offset value passed through to the chosen factory
 * @return reader created by the matching factory
 * @throws IOException if the name matches neither suffix check, or the
 *         factory fails
 */
protected ArchiveReader getArchiveReader(final File f,
  final long offset)
throws IOException {
  final String fileName = f.getName();
  if (ARCReaderFactory.isARCSuffix(fileName)) {
    return ARCReaderFactory.get(f, true, offset);
  }
  if (WARCReaderFactory.isWARCSuffix(fileName)) {
    return WARCReaderFactory.get(f, offset);
  }
  throw new IOException("Unknown file extension (Not ARC nor WARC): "
    + fileName);
}

代码示例来源:origin: iipc/webarchive-commons

/**
 * Chooses the ARC or WARC reader factory from the file name's suffix and
 * opens a reader on {@code f}.
 *
 * @param f      archive file to open
 * @param offset offset value passed through to the chosen factory
 * @return reader created by the matching factory
 * @throws IOException if the name matches neither suffix check, or the
 *         factory fails
 */
protected ArchiveReader getArchiveReader(final File f,
  final long offset)
throws IOException {
  final String fileName = f.getName();
  if (ARCReaderFactory.isARCSuffix(fileName)) {
    return ARCReaderFactory.get(f, true, offset);
  }
  if (WARCReaderFactory.isWARCSuffix(fileName)) {
    return WARCReaderFactory.get(f, offset);
  }
  throw new IOException("Unknown file extension (Not ARC nor WARC): "
    + fileName);
}

代码示例来源:origin: org.netpreserve.commons/webarchive-commons

/**
 * Chooses the ARC or WARC reader factory from the file name's suffix and
 * opens a reader on {@code f}.
 *
 * @param f      archive file to open
 * @param offset offset value passed through to the chosen factory
 * @return reader created by the matching factory
 * @throws IOException if the name matches neither suffix check, or the
 *         factory fails
 */
protected ArchiveReader getArchiveReader(final File f,
  final long offset)
throws IOException {
  final String fileName = f.getName();
  if (ARCReaderFactory.isARCSuffix(fileName)) {
    return ARCReaderFactory.get(f, true, offset);
  }
  if (WARCReaderFactory.isWARCSuffix(fileName)) {
    return WARCReaderFactory.get(f, offset);
  }
  throw new IOException("Unknown file extension (Not ARC nor WARC): "
    + fileName);
}

代码示例来源:origin: org.netpreserve.commons/webarchive-commons

/**
 * Chooses the ARC or WARC reader factory from the suffix of {@code id} and
 * opens a reader on the supplied stream.
 *
 * @param id            identifier whose suffix selects the factory
 * @param is            stream the reader will consume
 * @param atFirstRecord flag passed through to the chosen factory
 * @return reader created by the matching factory
 * @throws IOException if {@code id} matches neither suffix check, or the
 *         factory fails
 */
protected ArchiveReader getArchiveReader(final String id, 
    final InputStream is, final boolean atFirstRecord)
throws IOException {
  if (ARCReaderFactory.isARCSuffix(id)) {
    return ARCReaderFactory.get(id, is, atFirstRecord);
  }
  if (WARCReaderFactory.isWARCSuffix(id)) {
    return WARCReaderFactory.get(id, is, atFirstRecord);
  }
  throw new IOException("Unknown extension (Not ARC nor WARC): " + id);
}

代码示例来源:origin: org.netpreserve.commons/commons-web

/**
 * Chooses the ARC or WARC reader factory from the suffix of {@code id} and
 * opens a reader on the supplied stream.
 *
 * @param id            identifier whose suffix selects the factory
 * @param is            stream the reader will consume
 * @param atFirstRecord flag passed through to the chosen factory
 * @return reader created by the matching factory
 * @throws IOException if {@code id} matches neither suffix check, or the
 *         factory fails
 */
protected ArchiveReader getArchiveReader(final String id, 
    final InputStream is, final boolean atFirstRecord)
throws IOException {
  if (ARCReaderFactory.isARCSuffix(id)) {
    return ARCReaderFactory.get(id, is, atFirstRecord);
  }
  if (WARCReaderFactory.isWARCSuffix(id)) {
    return WARCReaderFactory.get(id, is, atFirstRecord);
  }
  throw new IOException("Unknown extension (Not ARC nor WARC): " + id);
}

代码示例来源:origin: iipc/webarchive-commons

/**
 * Chooses the ARC or WARC reader factory from the suffix of {@code id} and
 * opens a reader on the supplied stream.
 *
 * @param id            identifier whose suffix selects the factory
 * @param is            stream the reader will consume
 * @param atFirstRecord flag passed through to the chosen factory
 * @return reader created by the matching factory
 * @throws IOException if {@code id} matches neither suffix check, or the
 *         factory fails
 */
protected ArchiveReader getArchiveReader(final String id, 
    final InputStream is, final boolean atFirstRecord)
throws IOException {
  if (ARCReaderFactory.isARCSuffix(id)) {
    return ARCReaderFactory.get(id, is, atFirstRecord);
  }
  if (WARCReaderFactory.isWARCSuffix(id)) {
    return WARCReaderFactory.get(id, is, atFirstRecord);
  }
  throw new IOException("Unknown extension (Not ARC nor WARC): " + id);
}

代码示例来源:origin: lintool/warcbase

/**
 * Records the split's byte range in {@code start}/{@code end}, opens the
 * split's file, wraps it in a buffered WARC reader, and stores the reader,
 * its iterator and the initial position in the corresponding fields.
 *
 * @param genericSplit split to read; cast to {@code FileSplit}
 * @param context      task context supplying the configuration
 * @throws IOException if the file cannot be opened or the reader cannot be
 *         created
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
 final FileSplit fileSplit = (FileSplit) genericSplit;
 final Configuration conf = context.getConfiguration();

 start = fileSplit.getStart();
 end = start + fileSplit.getLength();

 final Path warcPath = fileSplit.getPath();
 final FSDataInputStream rawIn = warcPath.getFileSystem(conf).open(warcPath);

 // The full path string serves as the reader's identifier.
 reader = (WARCReader) WARCReaderFactory.get(warcPath.toString(),
   new BufferedInputStream(rawIn), true);
 iter = reader.iterator();

 this.pos = start;
}

代码示例来源:origin: iipc/openwayback

/**
 * Opens {@code file}, seeks to {@code offset}, reads the ARC or WARC record
 * found there and converts it into a {@link Resource}.
 *
 * <p>A trailing {@code ArcWarcFilenameFilter.OPEN_SUFFIX} is stripped from
 * the file name before the ARC/WARC check, so files carrying that suffix are
 * recognised by their underlying extension.
 *
 * <p>On success the file handle is left open because the reader passed to
 * the record converter still reads from it (presumably the returned
 * resource is responsible for closing it - confirm against the converter).
 * On any failure the handles opened here are closed before the exception
 * propagates, so a bad offset or corrupt record does not leak a descriptor.
 *
 * @param file   archive file to read from
 * @param offset byte position to seek to before reading
 * @return the resource built from the record at {@code offset}
 * @throws IOException if opening, seeking or reading the file fails
 * @throws ResourceNotAvailableException if the name is recognised as
 *         neither ARC nor WARC
 */
public static Resource getResource(File file, long offset)
    throws IOException, ResourceNotAvailableException {
  String name = file.getName();
  if (name.endsWith(ArcWarcFilenameFilter.OPEN_SUFFIX)) {
    name = name.substring(0, name.length()
        - ArcWarcFilenameFilter.OPEN_SUFFIX.length());
  }
  RandomAccessFile raf = new RandomAccessFile(file, "r");
  InputStream is = null;
  // Set only once the stream has been handed to a successfully built
  // resource; until then this method owns the handles and must close them.
  boolean handedOff = false;
  try {
    raf.seek(offset);
    is = new FileInputStream(raf.getFD());
    Resource r;
    if (isArc(name)) {
      ArchiveReader reader = ARCReaderFactory.get(name, is, false);
      r = ARCArchiveRecordToResource(reader.get(), reader);
    } else if (isWarc(name)) {
      ArchiveReader reader = WARCReaderFactory.get(name, is, false);
      r = WARCArchiveRecordToResource(reader.get(), reader);
    } else {
      throw new ResourceNotAvailableException("Unknown extension");
    }
    handedOff = true;
    return r;
  } finally {
    if (!handedOff) {
      if (is != null) {
        try {
          is.close();
        } catch (IOException ignored) {
          // Best-effort cleanup; the original failure is the one to report.
        }
      }
      try {
        raf.close();
      } catch (IOException ignored) {
        // Best-effort cleanup; the original failure is the one to report.
      }
    }
  }
}
public static Resource getResource(URL url, long offset)

代码示例来源:origin: org.netpreserve.commons/webarchive-commons

/**
 * Generates a CDX index for the WARC identified by {@code urlOrPath}.
 *
 * <p>The reader is opened with strict mode off and digesting on, then
 * passed to {@code output} together with {@code CDX_FILE}.
 *
 * @param urlOrPath locator of the WARC to index
 * @throws IOException if the reader cannot be opened or output fails
 * @throws java.text.ParseException declared for the delegated output call
 */
public static void createCDXIndexFile(String urlOrPath)
throws IOException, java.text.ParseException {
  final WARCReader reader = WARCReaderFactory.get(urlOrPath);
  reader.setStrict(false);
  reader.setDigest(true);
  output(reader, CDX_FILE);
}

代码示例来源:origin: org.netpreserve.commons/commons-web

/**
 * Generates a CDX index for the WARC identified by {@code urlOrPath}.
 *
 * <p>The reader is opened with strict mode off and digesting on, then
 * passed to {@code output} together with {@code CDX_FILE}.
 *
 * @param urlOrPath locator of the WARC to index
 * @throws IOException if the reader cannot be opened or output fails
 * @throws java.text.ParseException declared for the delegated output call
 */
public static void createCDXIndexFile(String urlOrPath)
throws IOException, java.text.ParseException {
  final WARCReader reader = WARCReaderFactory.get(urlOrPath);
  reader.setStrict(false);
  reader.setDigest(true);
  output(reader, CDX_FILE);
}

代码示例来源:origin: iipc/webarchive-commons

/**
 * Generates a CDX index for the WARC identified by {@code urlOrPath}.
 *
 * <p>The reader is opened with strict mode off and digesting on, then
 * passed to {@code output} together with {@code CDX_FILE}.
 *
 * @param urlOrPath locator of the WARC to index
 * @throws IOException if the reader cannot be opened or output fails
 * @throws java.text.ParseException declared for the delegated output call
 */
public static void createCDXIndexFile(String urlOrPath)
throws IOException, java.text.ParseException {
  final WARCReader reader = WARCReaderFactory.get(urlOrPath);
  reader.setStrict(false);
  reader.setDigest(true);
  output(reader, CDX_FILE);
}

代码示例来源:origin: org.archive.heritrix/heritrix-commons

/**
 * Opens {@code warc} with a WARC reader, builds an ARC writer that targets
 * {@code dir}, and hands both to the reader/writer {@code transform} overload.
 *
 * @param warc   input file; must pass {@code FileUtils.assertReadable}
 * @param dir    output directory; must pass {@code FileUtils.assertReadable}
 * @param prefix prefix forwarded to the writer settings
 * @param suffix suffix forwarded to the writer settings
 * @param force  NOTE(review): not read anywhere in this method - confirm
 *               whether it is intentionally ignored
 * @throws IOException if a readability check, opening the reader or the
 *         delegated transform fails
 * @throws java.text.ParseException declared for the delegated transform
 */
public void transform(final File warc, final File dir, final String prefix,
    final String suffix, final boolean force)
throws IOException, java.text.ParseException {
  FileUtils.assertReadable(warc);
  FileUtils.assertReadable(dir);

  final WARCReader reader = WARCReaderFactory.get(warc);

  // Single metadata line recording where the output came from.
  final String provenance = "Made from " + reader.getReaderIdentifier() + " by "
      + this.getClass().getName() + "/" + getRevision();
  final List<String> metadata = new ArrayList<String>();
  metadata.add(provenance);

  // NOTE(review): the meaning of -12 (third settings argument) is not
  // visible from here - confirm against WriterPoolSettingsData.
  final WriterPoolSettingsData settings = new WriterPoolSettingsData(
      prefix,
      suffix,
      -12,
      reader.isCompressed(),
      Arrays.asList(new File [] {dir}),
      metadata);
  final ARCWriter writer = new ARCWriter(new AtomicInteger(), settings);

  transform(reader, writer);
}

相关文章

微信公众号

最新文章

更多