Usage of the org.apache.spark.api.java.JavaRDD.keyBy() method, with code examples

x33g5p2x · Reposted 2022-01-21 · Category: Other

This article collects code examples of the org.apache.spark.api.java.JavaRDD.keyBy() method in Java and shows how JavaRDD.keyBy() is used in practice. The examples are drawn from selected projects on platforms such as GitHub, Stack Overflow, and Maven, so they should be useful as references. Details of the JavaRDD.keyBy() method:
Package: org.apache.spark.api.java
Class: JavaRDD
Method: keyBy

About JavaRDD.keyBy

<U> JavaPairRDD<U, T> keyBy(Function<T, U> f) creates a tuple for each element of the RDD by applying f to it: the computed value becomes the key and the original element becomes the value, so the result is a JavaPairRDD of (key, element) pairs.
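
For orientation before the collected examples, here is a minimal, self-contained sketch of a typical keyBy() call. It is an illustration only: the local master, the application name, and the word-length key function are chosen for this sketch and do not come from the projects quoted below.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class KeyByExample {
 public static void main(String[] args) {
  SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("keyBy-example");
  try (JavaSparkContext sc = new JavaSparkContext(conf)) {
   // Pair every word with its length: (5, "spark"), (5, "flink"), (6, "hadoop")
   JavaPairRDD<Integer, String> byLength = sc
     .parallelize(Arrays.asList("spark", "flink", "hadoop"))
     .keyBy(String::length);
   for (Tuple2<Integer, String> t : byLength.collect()) {
    System.out.println(t._1() + " -> " + t._2());
   }
  }
 }
}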

Code examples

Code example source: org.apache.spark/spark-core

@Test
public void keyBy() {
 JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2));
 List<Tuple2<String, Integer>> s = rdd.keyBy(Object::toString).collect();
 assertEquals(new Tuple2<>("1", 1), s.get(0));
 assertEquals(new Tuple2<>("2", 2), s.get(1));
}

Code example source: org.apache.spark/spark-core_2.10

@Test
public void combineByKey() {
 JavaRDD<Integer> originalRDD = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6));
 // Key each value by its remainder mod 3, then sum the values that share a key.
 Function<Integer, Integer> keyFunction = v1 -> v1 % 3;
 Function<Integer, Integer> createCombinerFunction = v1 -> v1;
 Function2<Integer, Integer, Integer> mergeValueFunction = (v1, v2) -> v1 + v2;
 JavaPairRDD<Integer, Integer> combinedRDD = originalRDD.keyBy(keyFunction)
  .combineByKey(createCombinerFunction, mergeValueFunction, mergeValueFunction);
 Map<Integer, Integer> results = combinedRDD.collectAsMap();
 // Key 0 collects 3 + 6 = 9, key 1 collects 1 + 4 = 5, key 2 collects 2 + 5 = 7.
 ImmutableMap<Integer, Integer> expected = ImmutableMap.of(0, 9, 1, 5, 2, 7);
 assertEquals(expected, results);
 Partitioner defaultPartitioner = Partitioner.defaultPartitioner(
  combinedRDD.rdd(),
  JavaConverters.collectionAsScalaIterableConverter(
   Collections.<RDD<?>>emptyList()).asScala().toSeq());
 // Same combine, but with an explicit partitioner, map-side combine disabled, and a Kryo serializer.
 combinedRDD = originalRDD.keyBy(keyFunction)
  .combineByKey(
   createCombinerFunction,
   mergeValueFunction,
   mergeValueFunction,
   defaultPartitioner,
   false,
   new KryoSerializer(new SparkConf()));
 results = combinedRDD.collectAsMap();
 assertEquals(expected, results);
}

Code example source: cloudera-labs/envelope

@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
 if (!dependencies.containsKey(intoDependency)) {
  throw new RuntimeException("Nest deriver points to non-existent nest-into dependency");
 }
 Dataset<Row> into = dependencies.get(intoDependency);
 if (!dependencies.containsKey(fromDependency)) {
  throw new RuntimeException("Nest deriver points to non-existent nest-from dependency");
 }
 Dataset<Row> from = dependencies.get(fromDependency);
 // Key both datasets by the configured key fields so they can be cogrouped.
 ExtractFieldsFunction extractFieldsFunction = new ExtractFieldsFunction(keyFieldNames);
 JavaPairRDD<List<Object>, Row> keyedIntoRDD = into.javaRDD().keyBy(extractFieldsFunction);
 JavaPairRDD<List<Object>, Row> keyedFromRDD = from.javaRDD().keyBy(extractFieldsFunction);
 // Cogroup on the key and nest the matching "from" rows into each "into" row.
 NestFunction nestFunction = new NestFunction();
 JavaRDD<Row> nestedRDD = keyedIntoRDD.cogroup(keyedFromRDD).values().map(nestFunction);
 StructType nestedSchema = into.schema().add(nestedFieldName, DataTypes.createArrayType(from.schema()));
 Dataset<Row> nested = into.sqlContext().createDataFrame(nestedRDD, nestedSchema);
 return nested;
}
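
The deriver above keys both input datasets by the same key fields and cogroups them so that matching rows can be nested together. Its project-specific classes (ExtractFieldsFunction, NestFunction) are not shown, so the following is a stripped-down sketch of just the keyBy() + cogroup() pattern; the first-letter key function and the sample strings are invented for illustration.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class KeyByCogroupExample {
 public static void main(String[] args) {
  SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("keyBy-cogroup");
  try (JavaSparkContext sc = new JavaSparkContext(conf)) {
   // Two RDDs keyed by the same function (the first letter of each string).
   JavaPairRDD<String, String> into = sc
     .parallelize(Arrays.asList("alpha", "beta"))
     .keyBy(s -> s.substring(0, 1));
   JavaPairRDD<String, String> from = sc
     .parallelize(Arrays.asList("ant", "bee", "bug"))
     .keyBy(s -> s.substring(0, 1));
   // cogroup pairs up, per key, the iterable of "into" rows with the iterable of "from" rows.
   for (Tuple2<String, Tuple2<Iterable<String>, Iterable<String>>> t : into.cogroup(from).collect()) {
    System.out.println(t._1() + " -> " + t._2());
   }
  }
 }
}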

Code example source: uk.gov.gchq.gaffer/parquet-store

// Partial excerpt: the call whose trailing arguments appear on the next line is elided in the original snippet.
  isEntity, groupByColumns, columnToPaths, aggregatorJson, gafferGroupObjectConverter);
// Key each row, merge rows that share a key with the aggregator, and keep only the merged rows.
final JavaRDD<Row> aggregatedRDD = data.javaRDD()
    .keyBy(keyExtractor)
    .reduceByKey(aggregator)
    .values();
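
The excerpt above keys each row, merges rows that share a key with the project's aggregator, and keeps only the merged rows. As a self-contained illustration of the same keyBy().reduceByKey().values() pattern, here is a sketch with hypothetical data: integers keyed by parity and summed per key (nothing in it comes from the Gaffer project).

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class KeyByReduceByKeyExample {
 public static void main(String[] args) {
  SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("keyBy-reduceByKey");
  try (JavaSparkContext sc = new JavaSparkContext(conf)) {
   JavaRDD<Integer> values = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6));
   // Key each element by parity, sum the values per key, then drop the keys again.
   JavaRDD<Integer> summedPerKey = values
     .keyBy(v -> v % 2)          // (1,1) (0,2) (1,3) (0,4) (1,5) (0,6)
     .reduceByKey(Integer::sum)  // (1,9) (0,12)
     .values();                  // 9 and 12 (order not guaranteed)
   System.out.println(summedPerKey.collect());
  }
 }
}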

Code example source: cloudera-labs/envelope

private JavaRDD<Row> planMutationsByKey(Dataset<Row> arriving, List<String> keyFieldNames,
                    Config plannerConfig, Config outputConfig) {
 // Key arriving rows by the configured key fields, then group all rows that share a key.
 JavaPairRDD<Row, Row> keyedArriving = 
   arriving.javaRDD().keyBy(new ExtractKeyFunction(keyFieldNames, accumulators));
 JavaPairRDD<Row, Iterable<Row>> arrivingByKey = 
   keyedArriving.groupByKey(getPartitioner(keyedArriving));
 // Join each group with the existing rows for the same key, then plan the mutations per key.
 JavaPairRDD<Row, Tuple2<Iterable<Row>, Iterable<Row>>> arrivingAndExistingByKey =
   arrivingByKey.mapPartitionsToPair(new JoinExistingForKeysFunction(outputConfig, keyFieldNames, accumulators));
 JavaRDD<Row> planned = 
   arrivingAndExistingByKey.flatMap(new PlanForKeyFunction(plannerConfig, accumulators));
 return planned;
}

Code example source: cloudera-labs/envelope

private JavaPairRDD<Row, Row> getDummyRDD(int numPartitions) {
 // Key each generated row by itself and spread the pairs across numPartitions partitions.
 return Contexts.getSparkSession().range(numPartitions).javaRDD()
   .map(new LongToRowFunction()).keyBy(new ItselfFunction<Row>()).repartition(numPartitions);
}

Code example source: org.qcri.rheem/rheem-iejoin

// Partial excerpt: several assignments are truncated in the original snippet (marked with "...").
// ...
    .keyBy(new extractData<Type0, Type1, Input>(get0Pivot_, get0Ref_))
    .sortByKey(new Data.Comparator<Type0, Type1>(list1ASC, list1ASCSec));
JavaPairRDD<Data<Type0, Type1>, Tuple2<Long, Input>> keyedDataRDD2 = inputRDD2UID
    .keyBy(new extractData<Type0, Type1, Input>(get1Pivot_, get1Ref_))
    .sortByKey(new Data.Comparator<Type0, Type1>(list2ASC, list2ASCSec));
// ...
    .keyBy(in -> in.getPartitionID());
JavaPairRDD<Long, List2AttributesObjectSkinny<Type0, Type1>> listObjectDataRDD2Indexd = listObjectDataRDD2
    .keyBy(in -> in.getPartitionID());
r1RowIDS = inputRDD1UID.keyBy(in -> in._1());
r2RowIDS = inputRDD2UID.keyBy(in -> in._1());

Code example source: org.qcri.rheem/rheem-iejoin

// Partial excerpt: the assignments preceding the two chained calls are truncated in the original snippet.
// ...
    .keyBy(new extractData<Type0, Type1, Input>(get0Pivot_, get0Ref_))
    .sortByKey(new Data.Comparator<Type0, Type1>(list1ASC, list1ASCSec));
// ...
    .keyBy(input -> input.getPartitionID());
r1RowIDS = inputRDD1UID.keyBy(in -> in._1());
r2RowIDS = r1RowIDS;
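
Both excerpts above chain keyBy() with sortByKey() using a comparator defined in the project. A minimal sketch of the same idea, using the keys' natural ordering with a descending flag instead of a custom comparator (the sample strings and the length-based key are illustrative only):

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class KeyBySortByKeyExample {
 public static void main(String[] args) {
  SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("keyBy-sortByKey");
  try (JavaSparkContext sc = new JavaSparkContext(conf)) {
   // Key each string by its length, then sort the pairs by key in descending order.
   JavaPairRDD<Integer, String> sorted = sc
     .parallelize(Arrays.asList("a", "ccc", "bb"))
     .keyBy(String::length)
     .sortByKey(false);                   // false = descending
   System.out.println(sorted.collect());  // [(3,ccc), (2,bb), (1,a)]
  }
 }
}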
