Usage of org.apache.spark.api.java.JavaPairRDD.groupByKey() with code examples

This article collects code examples of the Java method org.apache.spark.api.java.JavaPairRDD.groupByKey(), showing how it is used in practice. The examples are drawn from selected open-source projects hosted on GitHub, Stack Overflow, Maven, and similar platforms, and should make useful references. Details of the method:

Package path: org.apache.spark.api.java
Class name: JavaPairRDD
Method name: groupByKey

About JavaPairRDD.groupByKey

groupByKey() groups all values that share a key into a single Iterable, turning a JavaPairRDD<K, V> into a JavaPairRDD<K, Iterable<V>>. Overloads accept a target number of partitions or a custom org.apache.spark.Partitioner to control how keys are distributed. Because every value is shuffled across the network and each key's values are materialized together, prefer reduceByKey, foldByKey, or aggregateByKey when the per-key result can be computed incrementally.
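
A minimal, self-contained sketch of the basic call (local mode; the data and names here are illustrative, not taken from any of the projects below):

import java.util.Arrays;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class GroupByKeyDemo {
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext("local[*]", "GroupByKeyDemo");
    JavaPairRDD<String, Integer> pairs = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>("a", 1),
        new Tuple2<>("b", 2),
        new Tuple2<>("a", 3)));
    // Groups values by key: ("a", [1, 3]) and ("b", [2])
    JavaPairRDD<String, Iterable<Integer>> grouped = pairs.groupByKey();
    grouped.collectAsMap().forEach((k, vs) -> System.out.println(k + " -> " + vs));
    sc.stop();
  }
}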

Code examples

Code example source: origin: apache/hive (the identical shuffle method also appears in apache/drill)

@Override
public JavaPairRDD<HiveKey, Iterable<BytesWritable>> shuffle(
  JavaPairRDD<HiveKey, BytesWritable> input, int numPartitions) {
 if (numPartitions > 0) {
  return input.groupByKey(numPartitions);
 }
 return input.groupByKey();
}
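
Both branches return a JavaPairRDD<HiveKey, Iterable<BytesWritable>>; a positive numPartitions fixes the parallelism of the shuffle, while the no-argument overload lets Spark fall back to its default partitioning.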

Code example source: origin: OryxProject/oryx

private static JavaPairRDD<String,Collection<String>> knownsRDD(JavaRDD<String[]> allData,
                                boolean knownItems) {
 JavaRDD<String[]> sorted = allData.sortBy(datum -> Long.valueOf(datum[3]), true, allData.partitions().size());
 JavaPairRDD<String,Tuple2<String,Boolean>> tuples = sorted.mapToPair(datum -> {
   String user = datum[0];
   String item = datum[1];
   Boolean delete = datum[2].isEmpty();
   return knownItems ?
     new Tuple2<>(user, new Tuple2<>(item, delete)) :
     new Tuple2<>(item, new Tuple2<>(user, delete));
  });
 // TODO likely need to figure out a way to avoid groupByKey but collectByKey
 // won't work here -- doesn't guarantee enough about ordering
 return tuples.groupByKey().mapValues(idDeletes -> {
   Collection<String> ids = new HashSet<>();
   for (Tuple2<String,Boolean> idDelete : idDeletes) {
    if (idDelete._2()) {
     ids.remove(idDelete._1());
    } else {
     ids.add(idDelete._1());
    }
   }
   return ids;
  });
}
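
Each record either adds a user/item pair or, when field 2 is empty, marks it as a delete. Because the data is first sorted by field 3 (presumably a timestamp), replaying the grouped (id, delete) tuples in order leaves exactly the ids that were added and not later deleted; as the TODO notes, order-insensitive combiners cannot be used here.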

Code example source: origin: OryxProject/oryx

private JavaPairRDD<Integer, Iterable<double[]>> fetchClusteredPoints(JavaRDD<? extends Vector> evalData) {
 return evalData.mapToPair(vector -> {
  double closestDist = Double.POSITIVE_INFINITY;
  int minClusterID = Integer.MIN_VALUE;
  double[] vec = vector.toArray();
  DistanceFn<double[]> distanceFn = getDistanceFn();
  Map<Integer,ClusterInfo> clusters = getClustersByID();
  for (ClusterInfo cluster : clusters.values()) {
   double distance = distanceFn.applyAsDouble(cluster.getCenter(), vec);
   if (distance < closestDist) {
    closestDist = distance;
    minClusterID = cluster.getID();
   }
  }
  Preconditions.checkState(!Double.isInfinite(closestDist) && !Double.isNaN(closestDist));
  return new Tuple2<>(minClusterID, vec);
 }).groupByKey();
}

Code example source: origin: org.apache.spark/spark-core_2.11 (the identical test ships in spark-core and spark-core_2.10)

@SuppressWarnings("unchecked")
@Test
public void lookup() {
 JavaPairRDD<String, String> categories = sc.parallelizePairs(Arrays.asList(
  new Tuple2<>("Apples", "Fruit"),
  new Tuple2<>("Oranges", "Fruit"),
  new Tuple2<>("Oranges", "Citrus")
 ));
 assertEquals(2, categories.lookup("Oranges").size());
 assertEquals(2, Iterables.size(categories.groupByKey().lookup("Oranges").get(0)));
}
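
Before grouping, lookup("Oranges") returns the two values ["Fruit", "Citrus"]; after groupByKey the key maps to a single Iterable containing those same two values, which is why the second assertion measures its size with Iterables.size.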

Code example source: origin: OryxProject/oryx

}).groupByKey();

Code example source: origin: OryxProject/oryx

if (model.isImplicit()) {
 aggregated = tuples.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN);
} else {
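
This fragment is part of the aggregateScores method shown in full a few examples below.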

Code example source: origin: apache/kylin

new FlatOutputFucntion(cubeName, segmentId, metaUrl, sConf, samplingPercent, bytesWritten));
JavaPairRDD<SelfDefineSortableKey, Iterable<Text>> aggredRDD = flatOutputRDD.groupByKey(
    new FactDistinctPartitioner(cubeName, metaUrl, sConf, reducerMapping.getTotalReducerNum()));
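
This Kylin snippet uses the overload that takes a custom org.apache.spark.Partitioner, giving the caller full control over which reducer receives each key. A minimal sketch of that overload (the partitioner logic here is hypothetical, not Kylin's FactDistinctPartitioner):

import org.apache.spark.Partitioner;

// Hypothetical partitioner: route keys by hash into a fixed number of partitions.
class ModuloPartitioner extends Partitioner {
  private final int partitions;
  ModuloPartitioner(int partitions) { this.partitions = partitions; }
  @Override public int numPartitions() { return partitions; }
  @Override public int getPartition(Object key) {
    return Math.abs(key.hashCode() % partitions);
  }
}

// usage: pairs.groupByKey(new ModuloPartitioner(8))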

Code example source: origin: OryxProject/oryx

/**
 * Combines {@link Rating}s with the same user/item into one, with score as the sum of
 * all of the scores.
 */
private JavaRDD<Rating> aggregateScores(JavaRDD<? extends Rating> original, double epsilon) {
 JavaPairRDD<Tuple2<Integer,Integer>,Double> tuples =
   original.mapToPair(rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()));
 JavaPairRDD<Tuple2<Integer,Integer>,Double> aggregated;
 if (implicit) {
  // TODO can we avoid groupByKey? reduce, combine, fold don't seem viable since
  // they don't guarantee the delete elements are properly handled
  aggregated = tuples.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN);
 } else {
  // For non-implicit, last wins.
  aggregated = tuples.foldByKey(Double.NaN, (current, next) -> next);
 }
 JavaPairRDD<Tuple2<Integer,Integer>,Double> noNaN =
   aggregated.filter(kv -> !Double.isNaN(kv._2()));
 if (logStrength) {
  return noNaN.map(userProductScore -> new Rating(
    userProductScore._1()._1(),
    userProductScore._1()._2(),
    Math.log1p(userProductScore._2() / epsilon)));
 } else {
  return noNaN.map(userProductScore -> new Rating(
    userProductScore._1()._1(),
    userProductScore._1()._2(),
    userProductScore._2()));
 }
}
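
Note the two strategies: in the non-implicit branch foldByKey keeps only the last score per key without grouping, while the implicit branch falls back to groupByKey because of the delete handling the TODO mentions. When a per-key aggregate really is a plain associative sum, reduceByKey is the standard way to avoid materializing the per-key iterable, e.g. (a sketch that ignores the NaN/delete semantics above):

JavaPairRDD<Tuple2<Integer,Integer>,Double> summed = tuples.reduceByKey(Double::sum);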

Code example source: origin: OryxProject/oryx

positiveUserProducts.groupByKey().flatMapToPair(
 new PairFlatMapFunction<Tuple2<Integer,Iterable<Integer>>,Integer,Integer>() {
  private final RandomGenerator random = RandomManager.getRandom();

Code example source: origin: mahmoudparsian/data-algorithms-book

static JavaPairRDD<String, Tuple2<Double, Integer>> computeRankedProducts(
    JavaSparkContext context,
    JavaPairRDD<String, Long>[] ranks) {
  JavaPairRDD<String, Long> unionRDD = context.union(ranks);
  
  // next find unique keys, with their associated copa scores
  JavaPairRDD<String, Iterable<Long>> groupedByGeneRDD = unionRDD.groupByKey();
  
  // next calculate ranked products and the number of elements
  JavaPairRDD<String, Tuple2<Double, Integer>> rankedProducts =
      groupedByGeneRDD.mapValues((Iterable<Long> values) -> {
        // input: copa scores for all studies; output: (rankedProduct, N)
        int N = 0;
        long products = 1;
        for (Long v : values) {
          products *= v;
          N++;
        }
        double rankedProduct = Math.pow((double) products, 1.0 / N);
        return new Tuple2<>(rankedProduct, N);
      });
  return rankedProducts;
}

Code example source: origin: mahmoudparsian/data-algorithms-book (the same computation written with an anonymous Function instead of a lambda)

static JavaPairRDD<String, Tuple2<Double, Integer>> computeRankedProducts(
    JavaSparkContext context,
    JavaPairRDD<String, Long>[] ranks) {
  JavaPairRDD<String, Long> unionRDD = context.union(ranks);
  
  // next find unique keys, with their associated copa scores
  JavaPairRDD<String, Iterable<Long>> groupedByGeneRDD = unionRDD.groupByKey();
  
  // next calculate ranked products and the number of elements
  JavaPairRDD<String, Tuple2<Double, Integer>> rankedProducts = groupedByGeneRDD.mapValues(
      new Function<
             Iterable<Long>,            // input: copa scores for all studies
             Tuple2<Double, Integer>    // output: (rankedProduct, N)
            >() {
      @Override
      public Tuple2<Double, Integer> call(Iterable<Long> values) {
        int N = 0;
        long products = 1;
        for (Long v : values) {
          products *= v;
          N++;
        }
        
        double rankedProduct = Math.pow( (double) products, 1.0/((double) N));
        return new Tuple2<Double, Integer>(rankedProduct, N);
      }
  });                
  return rankedProducts;
}
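
Both variants multiply the raw values into a long, which overflows for many or large inputs. A numerically safer sketch computes the same geometric mean through logarithms (names are mine, not the book's):

groupedByGeneRDD.mapValues((Iterable<Long> values) -> {
  int N = 0;
  double sumLog = 0.0;
  for (Long v : values) {
    sumLog += Math.log(v); // assumes v > 0
    N++;
  }
  // exp(mean(log v)) == (prod v)^(1/N)
  return new Tuple2<>(Math.exp(sumLog / N), N);
});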

Code example source: origin: mahmoudparsian/data-algorithms-book

JavaPairRDD<String, Iterable<Double>> groupedByGene = genes.groupByKey();

Code example source: origin: apache/tinkerpop

public static <K, V, OK, OV> JavaPairRDD<OK, OV> executeReduce(
      final JavaPairRDD<K, V> mapOrCombineRDD, final MapReduce<K, V, OK, OV, ?> mapReduce,
      final Configuration graphComputerConfiguration) {
    JavaPairRDD<OK, OV> reduceRDD = mapOrCombineRDD.groupByKey().mapPartitionsToPair(partitionIterator -> {
      KryoShimServiceLoader.applyConfiguration(graphComputerConfiguration);
      return new ReduceIterator<>(MapReduce.<MapReduce<K, V, OK, OV, ?>>createMapReduce(HadoopGraph.open(graphComputerConfiguration), graphComputerConfiguration), partitionIterator);
    });
    if (mapReduce.getReduceKeySort().isPresent())
      reduceRDD = reduceRDD.sortByKey(mapReduce.getReduceKeySort().get(), true, 1);
    return reduceRDD;
  }
}
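
Here groupByKey gathers every value emitted for a key, and mapPartitionsToPair then streams each partition of grouped entries through the job's reduce phase; the optional sortByKey afterwards orders the output when the MapReduce job requests a key sort.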

Code example source: origin: mahmoudparsian/data-algorithms-book

JavaPairRDD<String, Iterable<String>> anagrams = rdd.groupByKey();

Code example source: origin: mahmoudparsian/data-algorithms-book

JavaPairRDD<Integer, Vector> closest = getClosest(data, centroids);
JavaPairRDD<Integer, Iterable<Vector>> pointsGroup = closest.groupByKey();
Map<Integer, Vector> newCentroids = getNewCentroids(pointsGroup);
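
This is the classic k-means update step. getClosest and getNewCentroids are elided from the snippet, but a plausible sketch of the centroid recomputation, assuming org.apache.spark.mllib.linalg vectors (my own code, not the book's), is:

import java.util.Map;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;

static Map<Integer, Vector> getNewCentroids(JavaPairRDD<Integer, Iterable<Vector>> pointsGroup) {
  // New centroid of a cluster = component-wise mean of its points.
  return pointsGroup.mapValues(points -> {
    double[] sum = null;
    int count = 0;
    for (Vector p : points) {
      double[] arr = p.toArray();
      if (sum == null) {
        sum = new double[arr.length];
      }
      for (int i = 0; i < arr.length; i++) {
        sum[i] += arr[i];
      }
      count++;
    }
    for (int i = 0; i < sum.length; i++) {
      sum[i] /= count;
    }
    return Vectors.dense(sum);
  }).collectAsMap();
}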

Code example source: origin: mahmoudparsian/data-algorithms-book

JavaPairRDD<String, Iterable<Tuple2<Integer,String>>> groupedByWord = wordAsKey.groupByKey();
