Usage of org.apache.spark.api.java.JavaPairRDD.mapToPair() with code examples

This article collects code examples of the Java method org.apache.spark.api.java.JavaPairRDD.mapToPair() and shows how it is used in practice. The examples are drawn from selected open-source projects published on GitHub, Stack Overflow, Maven and similar platforms, so they should serve as useful references. Details of JavaPairRDD.mapToPair() are as follows:
Package path: org.apache.spark.api.java.JavaPairRDD
Class name: JavaPairRDD
Method name: mapToPair

About JavaPairRDD.mapToPair

mapToPair applies a PairFunction to every element of the RDD and returns a new JavaPairRDD of the resulting key-value Tuple2 pairs. It is the pair-producing counterpart of map(): called on a JavaRDD it produces the first keyed RDD, and called on a JavaPairRDD it re-keys or otherwise transforms existing pairs. Like map(), it is a lazy, per-element (non-shuffling) transformation.
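
As a quick orientation, here is a minimal, self-contained sketch (the class name and local-mode setup are illustrative, not taken from the projects cited below):

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class MapToPairSketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("mapToPairSketch").setMaster("local[*]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      JavaRDD<String> words = sc.parallelize(Arrays.asList("coffee", "coffee", "pandas"));
      // mapToPair turns each element into a (key, value) Tuple2, producing a JavaPairRDD.
      JavaPairRDD<String, Integer> pairs = words.mapToPair(w -> new Tuple2<>(w, 1));
      // On a JavaPairRDD, mapToPair transforms each pair into a new pair (here: swap key and value).
      JavaPairRDD<Integer, String> swapped = pairs.mapToPair(Tuple2::swap);
      System.out.println(swapped.collect());  // e.g. [(1,coffee), (1,coffee), (1,pandas)]
    }
  }
}

Both mapToPair calls are lazy; nothing runs on the cluster until collect() triggers the job.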

Code examples

Code example source: databricks/learning-spark

public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    throw new Exception("Usage BasicSaveSequenceFile [sparkMaster] [output]");
  }
  String master = args[0];
  String fileName = args[1];

  JavaSparkContext sc = new JavaSparkContext(
    master, "basicsavesequencefile", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  List<Tuple2<String, Integer>> input = new ArrayList<>();
  input.add(new Tuple2<>("coffee", 1));
  input.add(new Tuple2<>("coffee", 2));
  input.add(new Tuple2<>("pandas", 3));
  JavaPairRDD<String, Integer> rdd = sc.parallelizePairs(input);
  // Wrap the native Java types in Hadoop Writables before saving as a SequenceFile.
  JavaPairRDD<Text, IntWritable> result = rdd.mapToPair(new ConvertToWritableTypes());
  result.saveAsHadoopFile(fileName, Text.class, IntWritable.class, SequenceFileOutputFormat.class);
}
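
ConvertToWritableTypes is a helper defined elsewhere in the learning-spark repository and is not shown above; a minimal sketch of such a PairFunction, assuming it simply wraps the native types in Hadoop Writables, could look like this:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

// Hypothetical reconstruction: wraps each (String, Integer) pair in Hadoop
// Writable types so the pair RDD can be saved as a SequenceFile.
public class ConvertToWritableTypes
    implements PairFunction<Tuple2<String, Integer>, Text, IntWritable> {
  @Override
  public Tuple2<Text, IntWritable> call(Tuple2<String, Integer> record) {
    return new Tuple2<>(new Text(record._1()), new IntWritable(record._2()));
  }
}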

Code example source: databricks/learning-spark

public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    throw new Exception("Usage BasicLoadSequenceFile [sparkMaster] [input]");
  }
  String master = args[0];
  String fileName = args[1];

  JavaSparkContext sc = new JavaSparkContext(
    master, "basicloadsequencefile", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  JavaPairRDD<Text, IntWritable> input = sc.sequenceFile(fileName, Text.class, IntWritable.class);
  // Unwrap the Hadoop Writables back into native Java types.
  JavaPairRDD<String, Integer> result = input.mapToPair(new ConvertToNativeTypes());
  List<Tuple2<String, Integer>> resultList = result.collect();
  for (Tuple2<String, Integer> record : resultList) {
    System.out.println(record);
  }
}
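
ConvertToNativeTypes is the counterpart helper, also defined elsewhere in the learning-spark repository; a minimal sketch under the same assumption:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

// Hypothetical reconstruction: unwraps the Hadoop Writable key/value pairs
// read from the SequenceFile back into native Java types.
public class ConvertToNativeTypes
    implements PairFunction<Tuple2<Text, IntWritable>, String, Integer> {
  @Override
  public Tuple2<String, Integer> call(Tuple2<Text, IntWritable> record) {
    return new Tuple2<>(record._1().toString(), record._2().get());
  }
}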

Code example source: apache/hive

@Override
public JavaPairRDD<WritableComparable, Writable> transform(
  JavaPairRDD<WritableComparable, Writable> input) {
 Preconditions.checkArgument(input == null,
   "AssertionError: MapInput doesn't take any input");
 JavaPairRDD<WritableComparable, Writable> result;
 if (toCache) {
  result = hadoopRDD.mapToPair(new CopyFunction());
  sparkPlan.addCachedRDDId(result.id());
  result = result.persist(StorageLevel.MEMORY_AND_DISK());
 } else {
  result = hadoopRDD;
 }
 result.setName(this.name);
 return result;
}
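
CopyFunction comes from Hive's Spark client code and is not shown here. Its purpose is to clone each record before caching, because Hadoop RecordReaders reuse the same Writable instances for every row, so caching references to them would leave every cached tuple pointing at the last record read. A rough, simplified sketch of such a copy function (not Hive's exact implementation) might look like this:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

// Deep-copies each record so that cached tuples do not all alias the single
// object instance that the Hadoop RecordReader reuses for every row.
public class CopyFunction implements
    PairFunction<Tuple2<WritableComparable, Writable>, WritableComparable, Writable> {
  @Override
  public Tuple2<WritableComparable, Writable> call(
      Tuple2<WritableComparable, Writable> tuple) throws Exception {
    Configuration conf = new Configuration();
    return new Tuple2<>(
        WritableUtils.clone(tuple._1(), conf),
        WritableUtils.clone(tuple._2(), conf));
  }
}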

Code example source: OryxProject/oryx

private static RDD<Tuple2<Object,double[]>> readAndConvertFeatureRDD(
  JavaPairRDD<String,float[]> javaRDD,
  Broadcast<? extends Map<String,Integer>> bIdToIndex) {
 RDD<Tuple2<Integer,double[]>> scalaRDD = javaRDD.mapToPair(t ->
   new Tuple2<>(bIdToIndex.value().get(t._1()), t._2())
 ).mapValues(f -> {
   double[] d = new double[f.length];
   for (int i = 0; i < d.length; i++) {
    d[i] = f[i];
   }
   return d;
  }
 ).rdd();
 // This mimics the persistence level established by the ALS training methods
 scalaRDD.persist(StorageLevel.MEMORY_AND_DISK());
 @SuppressWarnings("unchecked")
 RDD<Tuple2<Object,double[]>> objKeyRDD = (RDD<Tuple2<Object,double[]>>) (RDD<?>) scalaRDD;
 return objKeyRDD;
}

Code example source: org.apache.spark/spark-core (an identical test also appears in spark-core_2.10 and spark-core_2.11)

@SuppressWarnings("unchecked")
@Test
public void mapOnPairRDD() {
 JavaRDD<Integer> rdd1 = sc.parallelize(Arrays.asList(1,2,3,4));
 JavaPairRDD<Integer, Integer> rdd2 = rdd1.mapToPair(i -> new Tuple2<>(i, i % 2));
 JavaPairRDD<Integer, Integer> rdd3 = rdd2.mapToPair(in -> new Tuple2<>(in._2(), in._1()));
 assertEquals(Arrays.asList(
  new Tuple2<>(1, 1),
  new Tuple2<>(0, 2),
  new Tuple2<>(1, 3),
  new Tuple2<>(0, 4)), rdd3.collect());
}

Code example source: org.apache.spark/spark-core_2.10 (an identical test also appears in spark-core and spark-core_2.11)

@Test
public void mapOnPairRDD() {
 JavaRDD<Integer> rdd1 = sc.parallelize(Arrays.asList(1, 2, 3, 4));
 JavaPairRDD<Integer, Integer> rdd2 =
  rdd1.mapToPair(i -> new Tuple2<>(i, i % 2));
 JavaPairRDD<Integer, Integer> rdd3 =
  rdd2.mapToPair(in -> new Tuple2<>(in._2(), in._1()));
 Assert.assertEquals(Arrays.asList(
  new Tuple2<>(1, 1),
  new Tuple2<>(0, 2),
  new Tuple2<>(1, 3),
  new Tuple2<>(0, 4)), rdd3.collect());
}

Code example source: org.apache.spark/spark-core (an identical test also appears in spark-core_2.10 and spark-core_2.11)

@SuppressWarnings("unchecked")
@Test
public void hadoopFile() {
 String outputDir = new File(tempDir, "output").getAbsolutePath();
 List<Tuple2<Integer, String>> pairs = Arrays.asList(
  new Tuple2<>(1, "a"),
  new Tuple2<>(2, "aa"),
  new Tuple2<>(3, "aaa")
 );
 JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs);
 rdd.mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())))
  .saveAsHadoopFile(outputDir, IntWritable.class, Text.class, SequenceFileOutputFormat.class);
 JavaPairRDD<IntWritable, Text> output = sc.hadoopFile(outputDir,
  SequenceFileInputFormat.class, IntWritable.class, Text.class);
 assertEquals(pairs.toString(), output.map(Tuple2::toString).collect().toString());
}

Code example source: org.apache.spark/spark-core_2.11 (an identical test also appears in spark-core_2.10)

@SuppressWarnings("unchecked")
@Test
public void hadoopFileCompressed() {
 String outputDir = new File(tempDir, "output_compressed").getAbsolutePath();
 List<Tuple2<Integer, String>> pairs = Arrays.asList(
  new Tuple2<>(1, "a"),
  new Tuple2<>(2, "aa"),
  new Tuple2<>(3, "aaa")
 );
 JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs);
 rdd.mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())))
  .saveAsHadoopFile(outputDir, IntWritable.class, Text.class,
   SequenceFileOutputFormat.class, DefaultCodec.class);
 JavaPairRDD<IntWritable, Text> output = sc.hadoopFile(outputDir,
  SequenceFileInputFormat.class, IntWritable.class, Text.class);
 assertEquals(pairs.toString(), output.map(Tuple2::toString).collect().toString());
}

Code example source: org.apache.spark/spark-core_2.10

@SuppressWarnings("unchecked")
@Test
public void writeWithNewAPIHadoopFile() {
 String outputDir = new File(tempDir, "output").getAbsolutePath();
 List<Tuple2<Integer, String>> pairs = Arrays.asList(
  new Tuple2<>(1, "a"),
  new Tuple2<>(2, "aa"),
  new Tuple2<>(3, "aaa")
 );
 JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs);
 rdd.mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())))
  .saveAsNewAPIHadoopFile(outputDir, IntWritable.class, Text.class,
   org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.class);
 JavaPairRDD<IntWritable, Text> output =
  sc.sequenceFile(outputDir, IntWritable.class, Text.class);
 assertEquals(pairs.toString(), output.map(Tuple2::toString).collect().toString());
}

Code example source: org.apache.spark/spark-core_2.11 (an identical test also appears in spark-core and spark-core_2.10)

@SuppressWarnings("unchecked")
@Test
public void mapsFromPairsToPairs() {
 List<Tuple2<Integer, String>> pairs = Arrays.asList(
  new Tuple2<>(1, "a"),
  new Tuple2<>(2, "aa"),
  new Tuple2<>(3, "aaa")
 );
 JavaPairRDD<Integer, String> pairRDD = sc.parallelizePairs(pairs);
 // Regression test for SPARK-668:
 JavaPairRDD<String, Integer> swapped = pairRDD.flatMapToPair(
  item -> Collections.singletonList(item.swap()).iterator());
 swapped.collect();
 // There was never a bug here, but it's worth testing:
 pairRDD.mapToPair(Tuple2::swap).collect();
}

Code example source: apache/drill

@Override
public JavaPairRDD<WritableComparable, Writable> transform(
  JavaPairRDD<WritableComparable, Writable> input) {
 Preconditions.checkArgument(input == null,
   "AssertionError: MapInput doesn't take any input");
 JavaPairRDD<WritableComparable, Writable> result;
 if (toCache) {
  result = hadoopRDD.mapToPair(new CopyFunction());
  sparkPlan.addCachedRDDId(result.id());
  result = result.persist(StorageLevel.MEMORY_AND_DISK());
 } else {
  result = hadoopRDD;
 }
 return result;
}
