本文整理了Java中org.archive.io.warc.WARCReaderFactory.get()方法的一些代码示例,展示了WARCReaderFactory.get()的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。WARCReaderFactory.get()方法的具体详情如下:
包路径:org.archive.io.warc.WARCReaderFactory
类名称:WARCReaderFactory
方法名:get
方法说明:暂无
代码示例来源:origin: internetarchive/heritrix3
/**
 * Converts a WARC file into ARC output written under {@code dir}.
 *
 * <p>Both {@code warc} and {@code dir} are checked with
 * {@code FileUtils.assertReadable} before the WARC is opened. A single
 * metadata line naming the reader identifier, this class and its revision is
 * attached to the writer settings, and the actual record copying is delegated
 * to the {@code transform(reader, writer)} overload.
 *
 * @param warc   WARC file to read
 * @param dir    directory handed to the writer settings as the only output dir
 * @param prefix prefix passed to the writer settings
 * @param suffix suffix passed to the writer settings
 * @param force  not consulted in this method
 * @throws IOException propagated from the readability checks, the reader
 *         factory, or the delegated transform
 * @throws java.text.ParseException propagated from the delegated transform
 */
public void transform(final File warc, final File dir, final String prefix,
        final String suffix, final boolean force)
        throws IOException, java.text.ParseException {
    FileUtils.assertReadable(warc);
    FileUtils.assertReadable(dir);

    final WARCReader reader = WARCReaderFactory.get(warc);

    // Provenance line recorded in the output's metadata.
    final String provenance = "Made from " + reader.getReaderIdentifier()
            + " by " + this.getClass().getName() + "/" + getRevision();
    final List<String> metadata = new ArrayList<String>();
    metadata.add(provenance);

    final WriterPoolSettingsData settings = new WriterPoolSettingsData(
            prefix,
            suffix,
            -12,
            reader.isCompressed(),
            Arrays.asList(dir),
            metadata);
    final ARCWriter writer = new ARCWriter(new AtomicInteger(), settings);

    transform(reader, writer);
}
代码示例来源:origin: ViDA-NYU/ache
/**
 * Opens a WARC reader for the file at the given path.
 *
 * @param filePath path of the file to open; converted with {@code toFile()}
 * @return the reader produced by {@code WARCReaderFactory.get(File)}
 * @throws IOException propagated from {@code WARCReaderFactory.get}
 */
private WARCReader openFile(Path filePath) throws IOException {
return WARCReaderFactory.get(filePath.toFile());
}
代码示例来源:origin: org.netpreserve.openwayback/openwayback-core
/**
 * Opens the given WARC file with {@code WARCReaderFactory.get(File)} and
 * hands the resulting reader to another {@code iterator} overload.
 *
 * @param warc the WARC file to read
 * @return Iterator of SearchResults for the input WARC file
 * @throws IOException propagated from the reader factory or the delegated
 *         {@code iterator} call
 */
public CloseableIterator<CaptureSearchResult> iterator(File warc)
throws IOException {
return iterator(WARCReaderFactory.get(warc));
}
/**
代码示例来源:origin: lintool/warcbase
/**
 * Parses a single {@code WARCRecord} out of an in-memory byte array.
 *
 * <p>The bytes are wrapped in a buffered in-memory stream and opened through
 * {@code WARCReaderFactory.get} with an empty identifier and
 * {@code atFirstRecord} set to {@code false}; the record returned by the
 * reader's {@code get()} is handed back. The reader is not closed here.
 *
 * @param bytes raw bytes
 * @return parsed {@code WARCRecord}
 * @throws IOException propagated from the reader factory or {@code get()}
 */
public static WARCRecord fromBytes(byte[] bytes) throws IOException {
    final BufferedInputStream buffered =
            new BufferedInputStream(new ByteArrayInputStream(bytes));
    final WARCReader warcReader =
            (WARCReader) WARCReaderFactory.get("", buffered, false);
    return (WARCRecord) warcReader.get();
}
代码示例来源:origin: iipc/openwayback
/**
 * Opens the given WARC file with {@code WARCReaderFactory.get(File)} and
 * hands the resulting reader to another {@code iterator} overload.
 *
 * @param warc the WARC file to read
 * @return Iterator of SearchResults for the input WARC file
 * @throws IOException propagated from the reader factory or the delegated
 *         {@code iterator} call
 */
public CloseableIterator<CaptureSearchResult> iterator(File warc)
throws IOException {
return iterator(WARCReaderFactory.get(warc));
}
/**
代码示例来源:origin: Smerity/cc-warc-examples
/**
 * Opens the split's file and positions a WARC reader at its first record.
 *
 * <p>Stores the opened stream in {@code fsin}, the file's simple name in
 * {@code arPath}, and the reader in {@code ar}.
 *
 * @param inputSplit split to read; cast to {@code FileSplit}
 * @param context    task context supplying the configuration used to resolve
 *                   the file system
 * @throws IOException propagated from opening the file or creating the reader
 * @throws InterruptedException declared by the overridden method; not thrown
 *         by the visible code
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    final Path warcPath = ((FileSplit) inputSplit).getPath();
    final FileSystem fileSystem =
            warcPath.getFileSystem(context.getConfiguration());

    fsin = fileSystem.open(warcPath);
    arPath = warcPath.getName();
    // atFirstRecord = true: the stream starts at the beginning of the file.
    ar = WARCReaderFactory.get(warcPath.getName(), fsin, true);
}
代码示例来源:origin: iipc/openwayback
/**
 * Builds a search-result iterator for a WARC named by a local path or a URL.
 *
 * <p>If {@code pathOrUrl} names an existing regular file it is opened as a
 * {@code File}; otherwise the string itself is passed to
 * {@code WARCReaderFactory.get(String)}.
 *
 * @param pathOrUrl local file path or URL of the WARC
 * @return Iterator of SearchResults for input pathOrUrl
 * @throws IOException propagated from the reader factory or the delegated
 *         {@code iterator} call
 */
public CloseableIterator<CaptureSearchResult> iterator(String pathOrUrl)
        throws IOException {
    final File localFile = new File(pathOrUrl);
    if (!localFile.isFile()) {
        // Not a regular local file: let the factory interpret the string.
        return iterator(WARCReaderFactory.get(pathOrUrl));
    }
    return iterator(WARCReaderFactory.get(localFile));
}
/**
代码示例来源:origin: org.netpreserve.openwayback/openwayback-core
/**
 * Builds a search-result iterator for a WARC named by a local path or a URL.
 *
 * <p>If {@code pathOrUrl} names an existing regular file it is opened as a
 * {@code File}; otherwise the string itself is passed to
 * {@code WARCReaderFactory.get(String)}.
 *
 * @param pathOrUrl local file path or URL of the WARC
 * @return Iterator of SearchResults for input pathOrUrl
 * @throws IOException propagated from the reader factory or the delegated
 *         {@code iterator} call
 */
public CloseableIterator<CaptureSearchResult> iterator(String pathOrUrl)
        throws IOException {
    final File localFile = new File(pathOrUrl);
    if (!localFile.isFile()) {
        // Not a regular local file: let the factory interpret the string.
        return iterator(WARCReaderFactory.get(pathOrUrl));
    }
    return iterator(WARCReaderFactory.get(localFile));
}
/**
代码示例来源:origin: org.netpreserve.commons/commons-web
/**
 * Opens an ARC or WARC reader for a file, chosen by the file name's suffix.
 *
 * @param f      archive file; its name is tested with
 *               {@code ARCReaderFactory.isARCSuffix} first, then
 *               {@code WARCReaderFactory.isWARCSuffix}
 * @param offset offset passed through to the selected reader factory
 * @return reader from the matching factory
 * @throws IOException if the name matches neither suffix check, or propagated
 *         from the selected factory
 */
protected ArchiveReader getArchiveReader(final File f,
        final long offset)
        throws IOException {
    final String fileName = f.getName();
    if (ARCReaderFactory.isARCSuffix(fileName)) {
        return ARCReaderFactory.get(f, true, offset);
    }
    if (WARCReaderFactory.isWARCSuffix(fileName)) {
        return WARCReaderFactory.get(f, offset);
    }
    throw new IOException("Unknown file extension (Not ARC nor WARC): "
            + fileName);
}
代码示例来源:origin: iipc/webarchive-commons
/**
 * Opens an ARC or WARC reader for a file, chosen by the file name's suffix.
 *
 * @param f      archive file; its name is tested with
 *               {@code ARCReaderFactory.isARCSuffix} first, then
 *               {@code WARCReaderFactory.isWARCSuffix}
 * @param offset offset passed through to the selected reader factory
 * @return reader from the matching factory
 * @throws IOException if the name matches neither suffix check, or propagated
 *         from the selected factory
 */
protected ArchiveReader getArchiveReader(final File f,
        final long offset)
        throws IOException {
    final String fileName = f.getName();
    if (ARCReaderFactory.isARCSuffix(fileName)) {
        return ARCReaderFactory.get(f, true, offset);
    }
    if (WARCReaderFactory.isWARCSuffix(fileName)) {
        return WARCReaderFactory.get(f, offset);
    }
    throw new IOException("Unknown file extension (Not ARC nor WARC): "
            + fileName);
}
代码示例来源:origin: org.netpreserve.commons/webarchive-commons
/**
 * Opens an ARC or WARC reader for a file, chosen by the file name's suffix.
 *
 * @param f      archive file; its name is tested with
 *               {@code ARCReaderFactory.isARCSuffix} first, then
 *               {@code WARCReaderFactory.isWARCSuffix}
 * @param offset offset passed through to the selected reader factory
 * @return reader from the matching factory
 * @throws IOException if the name matches neither suffix check, or propagated
 *         from the selected factory
 */
protected ArchiveReader getArchiveReader(final File f,
        final long offset)
        throws IOException {
    final String fileName = f.getName();
    if (ARCReaderFactory.isARCSuffix(fileName)) {
        return ARCReaderFactory.get(f, true, offset);
    }
    if (WARCReaderFactory.isWARCSuffix(fileName)) {
        return WARCReaderFactory.get(f, offset);
    }
    throw new IOException("Unknown file extension (Not ARC nor WARC): "
            + fileName);
}
代码示例来源:origin: org.netpreserve.commons/webarchive-commons
/**
 * Opens an ARC or WARC reader over a stream, chosen by the identifier's suffix.
 *
 * @param id            identifier tested with
 *                      {@code ARCReaderFactory.isARCSuffix} first, then
 *                      {@code WARCReaderFactory.isWARCSuffix}, and passed on
 *                      to the selected factory
 * @param is            stream to read the archive from
 * @param atFirstRecord passed through unchanged to the selected factory
 * @return reader from the matching factory
 * @throws IOException if {@code id} matches neither suffix check, or
 *         propagated from the selected factory
 */
protected ArchiveReader getArchiveReader(final String id,
        final InputStream is, final boolean atFirstRecord)
        throws IOException {
    if (ARCReaderFactory.isARCSuffix(id)) {
        return ARCReaderFactory.get(id, is, atFirstRecord);
    }
    if (WARCReaderFactory.isWARCSuffix(id)) {
        return WARCReaderFactory.get(id, is, atFirstRecord);
    }
    throw new IOException("Unknown extension (Not ARC nor WARC): " + id);
}
代码示例来源:origin: org.netpreserve.commons/commons-web
/**
 * Opens an ARC or WARC reader over a stream, chosen by the identifier's suffix.
 *
 * @param id            identifier tested with
 *                      {@code ARCReaderFactory.isARCSuffix} first, then
 *                      {@code WARCReaderFactory.isWARCSuffix}, and passed on
 *                      to the selected factory
 * @param is            stream to read the archive from
 * @param atFirstRecord passed through unchanged to the selected factory
 * @return reader from the matching factory
 * @throws IOException if {@code id} matches neither suffix check, or
 *         propagated from the selected factory
 */
protected ArchiveReader getArchiveReader(final String id,
        final InputStream is, final boolean atFirstRecord)
        throws IOException {
    if (ARCReaderFactory.isARCSuffix(id)) {
        return ARCReaderFactory.get(id, is, atFirstRecord);
    }
    if (WARCReaderFactory.isWARCSuffix(id)) {
        return WARCReaderFactory.get(id, is, atFirstRecord);
    }
    throw new IOException("Unknown extension (Not ARC nor WARC): " + id);
}
代码示例来源:origin: iipc/webarchive-commons
/**
 * Opens an ARC or WARC reader over a stream, chosen by the identifier's suffix.
 *
 * @param id            identifier tested with
 *                      {@code ARCReaderFactory.isARCSuffix} first, then
 *                      {@code WARCReaderFactory.isWARCSuffix}, and passed on
 *                      to the selected factory
 * @param is            stream to read the archive from
 * @param atFirstRecord passed through unchanged to the selected factory
 * @return reader from the matching factory
 * @throws IOException if {@code id} matches neither suffix check, or
 *         propagated from the selected factory
 */
protected ArchiveReader getArchiveReader(final String id,
        final InputStream is, final boolean atFirstRecord)
        throws IOException {
    if (ARCReaderFactory.isARCSuffix(id)) {
        return ARCReaderFactory.get(id, is, atFirstRecord);
    }
    if (WARCReaderFactory.isWARCSuffix(id)) {
        return WARCReaderFactory.get(id, is, atFirstRecord);
    }
    throw new IOException("Unknown extension (Not ARC nor WARC): " + id);
}
代码示例来源:origin: lintool/warcbase
/**
 * Prepares this record reader for the given split.
 *
 * <p>Records the split's byte range in {@code start}/{@code end}, opens the
 * split's file, wraps it in a buffered stream, creates a {@code WARCReader}
 * over it, obtains the reader's iterator, and sets {@code pos} to
 * {@code start}.
 *
 * @param genericSplit split to read; cast to {@code FileSplit}
 * @param context      task context supplying the configuration used to
 *                     resolve the file system
 * @throws IOException propagated from opening the file or creating the reader
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    final FileSplit split = (FileSplit) genericSplit;
    final Path file = split.getPath();

    start = split.getStart();
    end = start + split.getLength();

    final FSDataInputStream fileIn =
            file.getFileSystem(context.getConfiguration()).open(file);
    // atFirstRecord = true: reading begins at the start of the stream.
    reader = (WARCReader) WARCReaderFactory.get(file.toString(),
            new BufferedInputStream(fileIn), true);
    iter = reader.iterator();

    this.pos = start;
}
代码示例来源:origin: iipc/openwayback
/**
 * Reads the single archive record that starts at {@code offset} in
 * {@code file} and converts it to a {@code Resource}.
 *
 * <p>The file name, with any trailing
 * {@code ArcWarcFilenameFilter.OPEN_SUFFIX} removed, decides whether the ARC
 * or the WARC reader factory is used. The file is opened for random access,
 * positioned at {@code offset}, and read through a stream sharing the same
 * file descriptor.
 *
 * <p>On success the open stream stays attached to the returned resource
 * (presumably the resource closes it later; confirm against
 * {@code Resource.close()}). On any failure, both the stream and the
 * random-access file are closed before the exception propagates, so no file
 * handle is leaked.
 *
 * @param file   ARC or WARC file, optionally carrying the "open" suffix
 * @param offset byte offset of the record within {@code file}
 * @return the resource built from the record at {@code offset}
 * @throws IOException if the file cannot be opened, positioned or read
 * @throws ResourceNotAvailableException if the file name is neither ARC nor
 *         WARC, or propagated from the record conversion
 */
public static Resource getResource(File file, long offset)
        throws IOException, ResourceNotAvailableException {
    String name = file.getName();
    if (name.endsWith(ArcWarcFilenameFilter.OPEN_SUFFIX)) {
        name = name.substring(0, name.length()
                - ArcWarcFilenameFilter.OPEN_SUFFIX.length());
    }

    RandomAccessFile raf = new RandomAccessFile(file, "r");
    InputStream is = null;
    // Flipped only once the stream has been handed to a returned Resource.
    boolean handedOff = false;
    try {
        raf.seek(offset);
        is = new FileInputStream(raf.getFD());

        Resource r;
        if (isArc(name)) {
            ArchiveReader reader = ARCReaderFactory.get(name, is, false);
            r = ARCArchiveRecordToResource(reader.get(), reader);
        } else if (isWarc(name)) {
            ArchiveReader reader = WARCReaderFactory.get(name, is, false);
            r = WARCArchiveRecordToResource(reader.get(), reader);
        } else {
            throw new ResourceNotAvailableException("Unknown extension");
        }
        handedOff = true;
        return r;
    } finally {
        if (!handedOff) {
            // Best-effort cleanup: a close failure must not mask the
            // exception that brought us here.
            if (is != null) {
                try {
                    is.close();
                } catch (IOException ignored) {
                    // intentionally ignored, see above
                }
            }
            try {
                raf.close();
            } catch (IOException ignored) {
                // intentionally ignored, see above
            }
        }
    }
}
public static Resource getResource(URL url, long offset)
代码示例来源:origin: org.netpreserve.commons/webarchive-commons
/**
 * Generates a CDX index for a WARC archive.
 *
 * <p>Opens the archive through {@code WARCReaderFactory.get(String)}, turns
 * strict mode off and digesting on, then passes the reader to
 * {@code output} with the {@code CDX_FILE} format.
 *
 * @param urlOrPath URL or path of the archive to index
 * @throws IOException propagated from the reader factory or {@code output}
 * @throws java.text.ParseException propagated from {@code output}
 */
public static void createCDXIndexFile(String urlOrPath)
        throws IOException, java.text.ParseException {
    final WARCReader warcReader = WARCReaderFactory.get(urlOrPath);
    warcReader.setStrict(false);
    warcReader.setDigest(true);
    output(warcReader, CDX_FILE);
}
代码示例来源:origin: org.netpreserve.commons/commons-web
/**
 * Generates a CDX index for a WARC archive.
 *
 * <p>Opens the archive through {@code WARCReaderFactory.get(String)}, turns
 * strict mode off and digesting on, then passes the reader to
 * {@code output} with the {@code CDX_FILE} format.
 *
 * @param urlOrPath URL or path of the archive to index
 * @throws IOException propagated from the reader factory or {@code output}
 * @throws java.text.ParseException propagated from {@code output}
 */
public static void createCDXIndexFile(String urlOrPath)
        throws IOException, java.text.ParseException {
    final WARCReader warcReader = WARCReaderFactory.get(urlOrPath);
    warcReader.setStrict(false);
    warcReader.setDigest(true);
    output(warcReader, CDX_FILE);
}
代码示例来源:origin: iipc/webarchive-commons
/**
 * Generates a CDX index for a WARC archive.
 *
 * <p>Opens the archive through {@code WARCReaderFactory.get(String)}, turns
 * strict mode off and digesting on, then passes the reader to
 * {@code output} with the {@code CDX_FILE} format.
 *
 * @param urlOrPath URL or path of the archive to index
 * @throws IOException propagated from the reader factory or {@code output}
 * @throws java.text.ParseException propagated from {@code output}
 */
public static void createCDXIndexFile(String urlOrPath)
        throws IOException, java.text.ParseException {
    final WARCReader warcReader = WARCReaderFactory.get(urlOrPath);
    warcReader.setStrict(false);
    warcReader.setDigest(true);
    output(warcReader, CDX_FILE);
}
代码示例来源:origin: org.archive.heritrix/heritrix-commons
/**
 * Converts a WARC file into ARC output written under {@code dir}.
 *
 * <p>Both {@code warc} and {@code dir} are checked with
 * {@code FileUtils.assertReadable} before the WARC is opened. A single
 * metadata line naming the reader identifier, this class and its revision is
 * attached to the writer settings, and the actual record copying is delegated
 * to the {@code transform(reader, writer)} overload.
 *
 * @param warc   WARC file to read
 * @param dir    directory handed to the writer settings as the only output dir
 * @param prefix prefix passed to the writer settings
 * @param suffix suffix passed to the writer settings
 * @param force  not consulted in this method
 * @throws IOException propagated from the readability checks, the reader
 *         factory, or the delegated transform
 * @throws java.text.ParseException propagated from the delegated transform
 */
public void transform(final File warc, final File dir, final String prefix,
        final String suffix, final boolean force)
        throws IOException, java.text.ParseException {
    FileUtils.assertReadable(warc);
    FileUtils.assertReadable(dir);

    final WARCReader reader = WARCReaderFactory.get(warc);

    // Provenance line recorded in the output's metadata.
    final String provenance = "Made from " + reader.getReaderIdentifier()
            + " by " + this.getClass().getName() + "/" + getRevision();
    final List<String> metadata = new ArrayList<String>();
    metadata.add(provenance);

    final WriterPoolSettingsData settings = new WriterPoolSettingsData(
            prefix,
            suffix,
            -12,
            reader.isCompressed(),
            Arrays.asList(dir),
            metadata);
    final ARCWriter writer = new ARCWriter(new AtomicInteger(), settings);

    transform(reader, writer);
}
内容来源于网络,如有侵权,请联系作者删除!