Usage and code examples for org.apache.spark.api.java.JavaPairRDD.saveAsHadoopFile()


This article collects Java code examples of the org.apache.spark.api.java.JavaPairRDD.saveAsHadoopFile() method and shows how it is used in practice. The examples are drawn from selected projects on platforms such as GitHub, Stack Overflow, and Maven, so they should serve as useful references. Details of JavaPairRDD.saveAsHadoopFile() follow:
Package: org.apache.spark.api.java
Class: JavaPairRDD
Method: saveAsHadoopFile

About JavaPairRDD.saveAsHadoopFile

saveAsHadoopFile writes a pair RDD to any Hadoop-supported file system, using a Hadoop OutputFormat from the old org.apache.hadoop.mapred API. The caller passes the output path together with the key, value, and output-format classes; overloads additionally accept a compression codec class or a JobConf.
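
To make the signature concrete before the collected examples, here is a minimal, self-contained sketch of the basic call. It is an illustration only: the local master URL, the application name, and the /tmp output path are placeholders, not values from any of the projects below.

import java.util.Arrays;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class SaveAsHadoopFileSketch {
  public static void main(String[] args) {
    // "local[*]" and the output path below are placeholder values for this sketch.
    JavaSparkContext sc = new JavaSparkContext("local[*]", "saveAsHadoopFileSketch");
    JavaPairRDD<Text, IntWritable> pairs = sc.parallelizePairs(Arrays.asList(
      new Tuple2<>(new Text("coffee"), new IntWritable(1)),
      new Tuple2<>(new Text("pandas"), new IntWritable(3))));
    // Keys and values are Hadoop Writables; the old mapred SequenceFileOutputFormat
    // writes them out as a SequenceFile under the given directory.
    pairs.saveAsHadoopFile("/tmp/saveAsHadoopFile-demo", Text.class, IntWritable.class,
      SequenceFileOutputFormat.class);
    sc.stop();
  }
}

The examples that follow show the same pattern in real projects, including the codec overload for compressed output and reading the files back through hadoopFile, newAPIHadoopFile, and sequenceFile.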

Code examples

Code example source: databricks/learning-spark

public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    throw new Exception("Usage BasicSaveSequenceFile [sparkMaster] [output]");
  }
  String master = args[0];
  String fileName = args[1];

  JavaSparkContext sc = new JavaSparkContext(
    master, "basicloadsequencefile", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  List<Tuple2<String, Integer>> input = new ArrayList<>();
  input.add(new Tuple2<>("coffee", 1));
  input.add(new Tuple2<>("coffee", 2));
  input.add(new Tuple2<>("pandas", 3));
  JavaPairRDD<String, Integer> rdd = sc.parallelizePairs(input);
  JavaPairRDD<Text, IntWritable> result = rdd.mapToPair(new ConvertToWritableTypes());
  result.saveAsHadoopFile(fileName, Text.class, IntWritable.class, SequenceFileOutputFormat.class);
  }
}

Code example source: org.apache.spark/spark-core_2.11 (identical in spark-core_2.10 and spark-core)

@SuppressWarnings("unchecked")
@Test
public void hadoopFileCompressed() {
 String outputDir = new File(tempDir, "output_compressed").getAbsolutePath();
 List<Tuple2<Integer, String>> pairs = Arrays.asList(
  new Tuple2<>(1, "a"),
  new Tuple2<>(2, "aa"),
  new Tuple2<>(3, "aaa")
 );
 JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs);
 rdd.mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())))
  .saveAsHadoopFile(outputDir, IntWritable.class, Text.class,
   SequenceFileOutputFormat.class, DefaultCodec.class);
 JavaPairRDD<IntWritable, Text> output = sc.hadoopFile(outputDir,
  SequenceFileInputFormat.class, IntWritable.class, Text.class);
 assertEquals(pairs.toString(), output.map(Tuple2::toString).collect().toString());
}

Code example source: org.apache.spark/spark-core_2.10 (identical in spark-core_2.11 and spark-core)

@SuppressWarnings("unchecked")
@Test
public void readWithNewAPIHadoopFile() throws IOException {
 String outputDir = new File(tempDir, "output").getAbsolutePath();
 List<Tuple2<Integer, String>> pairs = Arrays.asList(
  new Tuple2<>(1, "a"),
  new Tuple2<>(2, "aa"),
  new Tuple2<>(3, "aaa")
 );
 JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs);
 rdd.mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())))
  .saveAsHadoopFile(outputDir, IntWritable.class, Text.class, SequenceFileOutputFormat.class);
 JavaPairRDD<IntWritable, Text> output = sc.newAPIHadoopFile(outputDir,
  org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat.class,
  IntWritable.class, Text.class, Job.getInstance().getConfiguration());
 assertEquals(pairs.toString(), output.map(Tuple2::toString).collect().toString());
}

Code example source: org.apache.spark/spark-core_2.10 (identical in spark-core_2.11 and spark-core)

@SuppressWarnings("unchecked")
@Test
public void hadoopFile() {
 String outputDir = new File(tempDir, "output").getAbsolutePath();
 List<Tuple2<Integer, String>> pairs = Arrays.asList(
  new Tuple2<>(1, "a"),
  new Tuple2<>(2, "aa"),
  new Tuple2<>(3, "aaa")
 );
 JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs);
 rdd.mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())))
  .saveAsHadoopFile(outputDir, IntWritable.class, Text.class, SequenceFileOutputFormat.class);
 JavaPairRDD<IntWritable, Text> output = sc.hadoopFile(outputDir,
  SequenceFileInputFormat.class, IntWritable.class, Text.class);
 assertEquals(pairs.toString(), output.map(Tuple2::toString).collect().toString());
}

Code example source: org.apache.spark/spark-core_2.10 (identical in spark-core_2.11 and spark-core)

@SuppressWarnings("unchecked")
@Test
public void sequenceFile() {
 String outputDir = new File(tempDir, "output").getAbsolutePath();
 List<Tuple2<Integer, String>> pairs = Arrays.asList(
  new Tuple2<>(1, "a"),
  new Tuple2<>(2, "aa"),
  new Tuple2<>(3, "aaa")
 );
 JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs);
 rdd.mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())))
  .saveAsHadoopFile(outputDir, IntWritable.class, Text.class, SequenceFileOutputFormat.class);
 // Read the output back as a sequence file of Writables
 JavaPairRDD<Integer, String> readRDD = sc.sequenceFile(outputDir, IntWritable.class,
  Text.class).mapToPair(pair -> new Tuple2<>(pair._1().get(), pair._2().toString()));
 assertEquals(pairs, readRDD.collect());
}

Code example source: org.apache.spark/spark-core_2.11 (identical in spark-core_2.10 and spark-core)

@Test
public void sequenceFile() {
 File tempDir = Files.createTempDir();
 tempDir.deleteOnExit();
 String outputDir = new File(tempDir, "output").getAbsolutePath();
 List<Tuple2<Integer, String>> pairs = Arrays.asList(
  new Tuple2<>(1, "a"),
  new Tuple2<>(2, "aa"),
  new Tuple2<>(3, "aaa")
 );
 JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs);
 rdd.mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())))
  .saveAsHadoopFile(outputDir, IntWritable.class, Text.class, SequenceFileOutputFormat.class);
 // Read the output back as a sequence file of Writables
 JavaPairRDD<Integer, String> readRDD = sc.sequenceFile(outputDir, IntWritable.class, Text.class)
  .mapToPair(pair -> new Tuple2<>(pair._1().get(), pair._2().toString()));
 Assert.assertEquals(pairs, readRDD.collect());
 Utils.deleteRecursively(tempDir);
}

Code example source: mahmoudparsian/data-algorithms-book

ptRDD.saveAsHadoopFile("/naivebayes/pt",               // name of path
                       PairOfStrings.class,            // key class
                       DoubleWritable.class,           // value class
                       SequenceFileOutputFormat.class); // output format class (assumed; the scraped snippet was truncated here)

Code example source: org.rcsb/mmtf-spark

/**
 * Join two RDDs together into one larger one and save it as a new file.
 * @param datasetOne the first {@link JavaPairRDD} to join
 * @param datasetTwo the second {@link JavaPairRDD} to join
 * @param outputUri the output URI to write to
 */
public static void joinAndSave(JavaPairRDD<Text, BytesWritable> datasetOne,
    JavaPairRDD<Text, BytesWritable> datasetTwo, String outputUri) {
  SparkUtils.joinDatasets(datasetOne, datasetTwo)
      .saveAsHadoopFile(outputUri, Text.class, BytesWritable.class, SequenceFileOutputFormat.class);
}

Code example source: rathboma/hadoop-framework-examples

public static void main(String[] args) throws Exception {
  JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("SparkJoins").setMaster("local"));
  ExampleJob job = new ExampleJob(sc);
  JavaPairRDD<String, String> output_rdd = job.run(args[0], args[1]);
  output_rdd.saveAsHadoopFile(args[2], String.class, String.class, TextOutputFormat.class);
  sc.close();
  }
}
