This article collects code examples of the Java method org.apache.spark.api.java.JavaSparkContext.sequenceFile() and shows how it is used in practice. The examples come from platforms such as GitHub, Stack Overflow, and Maven, extracted from a number of selected projects, and should serve as useful references. Details of the JavaSparkContext.sequenceFile() method:
Package: org.apache.spark.api.java
Class: JavaSparkContext
Method: sequenceFile
Description: returns a JavaPairRDD backed by the Hadoop SequenceFile(s) at the given path, using the given key and value classes; an overload additionally accepts a suggested minimum number of partitions.
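Before the collected examples, here is a minimal, self-contained sketch of both overloads. The file path, app name, and key/value types below are placeholder assumptions, not taken from any of the quoted projects:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class SequenceFileExample {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[*]", "sequenceFileExample");
        // Three-argument overload: path plus the Writable key/value classes stored in the file.
        JavaPairRDD<Text, IntWritable> rdd =
            sc.sequenceFile("hdfs:///tmp/example.seq", Text.class, IntWritable.class);
        // Four-argument overload: additionally suggests a minimum number of partitions.
        JavaPairRDD<Text, IntWritable> repartitioned =
            sc.sequenceFile("hdfs:///tmp/example.seq", Text.class, IntWritable.class, 4);
        // Copy the Writables into native types before collecting; the underlying
        // Hadoop RecordReader may reuse the same Writable instance for every record.
        rdd.mapToPair(t -> new scala.Tuple2<>(t._1().toString(), t._2().get()))
            .collect()
            .forEach(System.out::println);
        sc.close();
    }
}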
Code example source: apache/kylin
/**
 * Read the given path as a Java RDD; the path can contain second-level subfolders.
 * @param inputPath the input path to read
 * @param fs the file system the path lives on
 * @param sc the Spark context used to create the RDD
 * @param keyClass the SequenceFile key class
 * @param valueClass the SequenceFile value class
 * @return a JavaPairRDD over all matching SequenceFiles
 * @throws IOException if listing the input path fails
 */
public static JavaPairRDD parseInputPath(String inputPath, FileSystem fs, JavaSparkContext sc, Class keyClass,
        Class valueClass) throws IOException {
    List<String> inputFolders = Lists.newArrayList();
    Path inputHDFSPath = new Path(inputPath);
    FileStatus[] fileStatuses = fs.listStatus(inputHDFSPath);
    boolean hasDir = false;
    for (FileStatus stat : fileStatuses) {
        if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) {
            hasDir = true;
            inputFolders.add(stat.getPath().toString());
        }
    }
    if (!hasDir) {
        return sc.sequenceFile(inputHDFSPath.toString(), keyClass, valueClass);
    }
    return sc.sequenceFile(StringUtil.join(inputFolders, ","), keyClass, valueClass);
}
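Two details worth noting in this helper: folders whose names start with "_" are skipped because Hadoop treats them as hidden metadata (for example _SUCCESS marker files), and the final call passes a comma-separated list of folders to sequenceFile(), which works because the underlying Hadoop FileInputFormat interprets commas in an input path string as separators between multiple inputs.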
Code example source: apache/kylin
private static JavaRDD<String[]> getSequenceFormatHiveInput(JavaSparkContext sc, String inputPath) {
    return sc.sequenceFile(inputPath, BytesWritable.class, Text.class).values()
            .map(new Function<Text, String[]>() {
                @Override
                public String[] call(Text text) throws Exception {
                    String s = Bytes.toString(text.getBytes(), 0, text.getLength());
                    return s.split(BatchConstants.SEQUENCE_FILE_DEFAULT_DELIMITER, -1);
                }
            });
}
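Here the SequenceFile keys (BytesWritable) are discarded by values(); only the Text payload is split into columns, and the -1 limit passed to split() preserves trailing empty columns that would otherwise be dropped.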
Code example source: databricks/learning-spark
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        throw new Exception("Usage BasicLoadSequenceFile [sparkMaster] [input]");
    }
    String master = args[0];
    String fileName = args[1];
    JavaSparkContext sc = new JavaSparkContext(
        master, "basicloadsequencefile", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaPairRDD<Text, IntWritable> input = sc.sequenceFile(fileName, Text.class, IntWritable.class);
    JavaPairRDD<String, Integer> result = input.mapToPair(new ConvertToNativeTypes());
    List<Tuple2<String, Integer>> resultList = result.collect();
    for (Tuple2<String, Integer> record : resultList) {
        System.out.println(record);
    }
}
}
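The ConvertToNativeTypes helper is referenced above but not included in the snippet. A plausible reconstruction, assuming it simply unwraps the Hadoop Writables into native Java types (the actual class in learning-spark may differ):
// Assumed imports: org.apache.spark.api.java.function.PairFunction,
// scala.Tuple2, org.apache.hadoop.io.Text, org.apache.hadoop.io.IntWritable
static class ConvertToNativeTypes
        implements PairFunction<Tuple2<Text, IntWritable>, String, Integer> {
    @Override
    public Tuple2<String, Integer> call(Tuple2<Text, IntWritable> record) {
        // Copy the contents out of the Writables into plain Java types.
        return new Tuple2<>(record._1().toString(), record._2().get());
    }
}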
Code example source: org.apache.spark/spark-core_2.10
@SuppressWarnings("unchecked")
@Test
public void writeWithNewAPIHadoopFile() {
    String outputDir = new File(tempDir, "output").getAbsolutePath();
    List<Tuple2<Integer, String>> pairs = Arrays.asList(
        new Tuple2<>(1, "a"),
        new Tuple2<>(2, "aa"),
        new Tuple2<>(3, "aaa")
    );
    JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs);
    rdd.mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())))
        .saveAsNewAPIHadoopFile(outputDir, IntWritable.class, Text.class,
            org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.class);
    JavaPairRDD<IntWritable, Text> output =
        sc.sequenceFile(outputDir, IntWritable.class, Text.class);
    assertEquals(pairs.toString(), output.map(Tuple2::toString).collect().toString());
}
Code example source: org.apache.spark/spark-core_2.10
@SuppressWarnings("unchecked")
@Test
public void sequenceFile() {
    String outputDir = new File(tempDir, "output").getAbsolutePath();
    List<Tuple2<Integer, String>> pairs = Arrays.asList(
        new Tuple2<>(1, "a"),
        new Tuple2<>(2, "aa"),
        new Tuple2<>(3, "aaa")
    );
    JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs);
    rdd.mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())))
        .saveAsHadoopFile(outputDir, IntWritable.class, Text.class, SequenceFileOutputFormat.class);
    // Try reading the output back as an object file
    JavaPairRDD<Integer, String> readRDD = sc.sequenceFile(outputDir, IntWritable.class,
        Text.class).mapToPair(pair -> new Tuple2<>(pair._1().get(), pair._2().toString()));
    assertEquals(pairs, readRDD.collect());
}
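Note that the test maps the IntWritable/Text pairs back to Integer/String before collecting. This matters beyond type convenience: Hadoop's RecordReader re-uses the same Writable object for every record, so an RDD of raw Writables cannot safely be cached or collected without copying the values out first.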
Code example source: org.apache.spark/spark-core_2.11
@Test
public void sequenceFile() {
    File tempDir = Files.createTempDir();
    tempDir.deleteOnExit();
    String outputDir = new File(tempDir, "output").getAbsolutePath();
    List<Tuple2<Integer, String>> pairs = Arrays.asList(
        new Tuple2<>(1, "a"),
        new Tuple2<>(2, "aa"),
        new Tuple2<>(3, "aaa")
    );
    JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs);
    rdd.mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())))
        .saveAsHadoopFile(outputDir, IntWritable.class, Text.class, SequenceFileOutputFormat.class);
    // Try reading the output back as an object file
    JavaPairRDD<Integer, String> readRDD = sc.sequenceFile(outputDir, IntWritable.class, Text.class)
        .mapToPair(pair -> new Tuple2<>(pair._1().get(), pair._2().toString()));
    Assert.assertEquals(pairs, readRDD.collect());
    Utils.deleteRecursively(tempDir);
}
Code example source: apache/kylin
CubeSegment sourceSegment = findSourceSegment(path, cubeInstance);
final String cuboidInputPath = BatchCubingJobBuilder2.getCuboidOutputPathsByLevel(path, level);
JavaPairRDD<Text, Text> segRdd = sc.sequenceFile(cuboidInputPath, Text.class, Text.class);
Code example source: stackoverflow.com
JavaSparkContext sc = new JavaSparkContext(conf);
JavaPairRDD<LongWritable, Vector> input =
    sc.sequenceFile(fileName, LongWritable.class, Vector.class);
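For this to work at runtime, the classes stored in a SequenceFile are expected to be Hadoop Writable implementations, so the Vector type in this Stack Overflow snippet presumably refers to a Writable vector class from the asker's codebase rather than java.util.Vector.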
Code example source: org.rcsb/mmtf-spark
/**
 * Read a Hadoop sequence file to a String/byte[] pair.
 * @param filePath the input path of the hadoop file
 * @return the {@link JavaPairRDD} of the PDB code and the byte array of the data.
 */
public static JavaPairRDD<String, byte[]> readHadoopFile(String filePath) {
    return getSparkContext()
        .sequenceFile(filePath, Text.class, BytesWritable.class, 8)
        .mapToPair(t -> new Tuple2<String, byte[]>(t._1.toString(), t._2.getBytes()));
}
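This example uses the four-argument overload: the trailing 8 is the suggested minimum number of partitions for the resulting RDD. One caveat worth knowing when reading code like this: BytesWritable.getBytes() returns the backing buffer, which can be longer than the valid payload (getLength()), so whether this mapToPair is safe depends on how the files were written.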
Code example source: org.apache.kylin/kylin-engine-spark
private static JavaRDD<String[]> getSequenceFormatHiveInput(JavaSparkContext sc, String inputPath) {
    return sc.sequenceFile(inputPath, BytesWritable.class, Text.class).values()
            .map(new Function<Text, String[]>() {
                @Override
                public String[] call(Text text) throws Exception {
                    String s = Bytes.toString(text.getBytes(), 0, text.getLength());
                    return s.split(BatchConstants.SEQUENCE_FILE_DEFAULT_DELIMITER);
                }
            });
}
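This is the same helper as in the apache/kylin example above, except that split() is called here without the -1 limit, so trailing empty columns are silently dropped rather than preserved.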
Code example source: deepspark/deepspark
public static void main(String[] args) {
    // TODO Auto-generated method stub
    SparkConf conf = new SparkConf().setAppName("ImagenetSampler")
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    JavaSparkContext sc = new JavaSparkContext(conf);
    int numExecutors = conf.getInt("spark.executor.instances", -1);
    System.out.println("number of executors = " + numExecutors);
    System.out.println("Data Loading...");
    JavaPairRDD<FloatWritable, ArrayPrimitiveWritable> train_seq =
        sc.sequenceFile("imagenet_sampled.hsf", FloatWritable.class, ArrayPrimitiveWritable.class);
    train_seq.foreach(new VoidFunction<Tuple2<FloatWritable, ArrayPrimitiveWritable>>() {
        @Override
        public void call(Tuple2<FloatWritable, ArrayPrimitiveWritable> arg0) throws Exception {
            System.out.println(arg0._1.get() + " " + ((float[]) arg0._2.get()).length);
        }
    });
    sc.close();
}
Code example source: deepspark/deepspark
sc.sequenceFile("/imagenet_train.hsf", FloatWritable.class, ArrayPrimitiveWritable.class);
Code example source: usc-isi-i2/Web-Karma
JavaPairRDD<Writable, Text> input = sc.sequenceFile(filePath, Writable.class, Text.class, partitions);
pairs = input.mapToPair(new PairFunction<Tuple2<Writable, Text>, String, String>() {
    private static final long serialVersionUID = -9042224661662821670L;