本文整理了 Java 中 org.apache.spark.api.java.JavaPairRDD.groupByKey() 方法的一些代码示例，展示了 JavaPairRDD.groupByKey() 的具体用法。这些代码示例主要来源于 GitHub、Stack Overflow、Maven 等平台，是从一些精选项目中提取出来的代码，具有较强的参考意义，能在一定程度上帮助到你。JavaPairRDD.groupByKey() 方法的具体详情如下：
包路径:org.apache.spark.api.java.JavaPairRDD
类名称:JavaPairRDD
方法名:groupByKey
暂无
代码示例来源:origin: apache/hive
/**
 * Groups the input pairs by key.
 *
 * @param input pairs to group
 * @param numPartitions partition count passed to {@code groupByKey} when positive; for zero or
 *     negative values the no-argument {@code groupByKey()} is used instead
 * @return one entry per key holding all of that key's values
 */
@Override
public JavaPairRDD<HiveKey, Iterable<BytesWritable>> shuffle(
    JavaPairRDD<HiveKey, BytesWritable> input, int numPartitions) {
  return numPartitions > 0
      ? input.groupByKey(numPartitions)
      : input.groupByKey();
}
代码示例来源:origin: apache/drill
/**
 * Groups the input pairs by key, honouring {@code numPartitions} only when it is positive.
 *
 * @param input pairs to group
 * @param numPartitions requested partition count; values of zero or less select the
 *     no-argument {@code groupByKey()}
 * @return one entry per key holding all of that key's values
 */
@Override
public JavaPairRDD<HiveKey, Iterable<BytesWritable>> shuffle(
    JavaPairRDD<HiveKey, BytesWritable> input, int numPartitions) {
  // Guard clause: no usable partition count was supplied.
  if (numPartitions <= 0) {
    return input.groupByKey();
  }
  return input.groupByKey(numPartitions);
}
代码示例来源:origin: OryxProject/oryx
/**
 * Builds, per key, the set of IDs that remain after replaying add/delete events in order.
 *
 * @param allData rows of fields; field 0 is read as the user, field 1 as the item, field 2
 *     marks a delete when empty, and field 3 is parsed as a long and used as the sort key
 * @param knownItems if true the result is keyed by user with item IDs as values; otherwise it
 *     is keyed by item with user IDs as values
 * @return for each key, the IDs still present after all of its events were applied
 */
private static JavaPairRDD<String,Collection<String>> knownsRDD(JavaRDD<String[]> allData,
                                                                boolean knownItems) {
  // Sort ascending by field 3 (presumably a timestamp -- confirm) before replaying events.
  int numPartitions = allData.partitions().size();
  JavaRDD<String[]> ordered =
      allData.sortBy(fields -> Long.valueOf(fields[3]), true, numPartitions);

  JavaPairRDD<String,Tuple2<String,Boolean>> keyedEvents = ordered.mapToPair(fields -> {
    String user = fields[0];
    String item = fields[1];
    Boolean delete = fields[2].isEmpty();
    String key = knownItems ? user : item;
    String id = knownItems ? item : user;
    return new Tuple2<>(key, new Tuple2<>(id, delete));
  });

  // TODO likely need to figure out a way to avoid groupByKey but collectByKey
  // won't work here -- doesn't guarantee enough about ordering
  return keyedEvents.groupByKey().mapValues(events -> {
    Collection<String> remaining = new HashSet<>();
    for (Tuple2<String,Boolean> event : events) {
      String id = event._1();
      if (event._2()) {
        remaining.remove(id);
      } else {
        remaining.add(id);
      }
    }
    return remaining;
  });
}
代码示例来源:origin: OryxProject/oryx
/**
 * Assigns every vector to the cluster whose center is closest according to
 * {@code getDistanceFn()}, then groups the vectors (as {@code double[]}) by that cluster's ID.
 *
 * @param evalData vectors to assign
 * @return cluster ID mapped to all points assigned to it
 */
private JavaPairRDD<Integer, Iterable<double[]>> fetchClusteredPoints(JavaRDD<? extends Vector> evalData) {
  return evalData.mapToPair(vector -> {
    double bestDistance = Double.POSITIVE_INFINITY;
    int bestClusterID = Integer.MIN_VALUE;
    double[] point = vector.toArray();
    DistanceFn<double[]> distanceFn = getDistanceFn();
    Map<Integer,ClusterInfo> clustersByID = getClustersByID();
    for (ClusterInfo candidate : clustersByID.values()) {
      double candidateDistance = distanceFn.applyAsDouble(candidate.getCenter(), point);
      if (candidateDistance < bestDistance) {
        bestDistance = candidateDistance;
        bestClusterID = candidate.getID();
      }
    }
    // Fails if no cluster produced a finite distance (e.g. there were no clusters at all).
    Preconditions.checkState(!(Double.isInfinite(bestDistance) || Double.isNaN(bestDistance)));
    return new Tuple2<>(bestClusterID, point);
  }).groupByKey();
}
代码示例来源:origin: org.apache.spark/spark-core_2.11
/**
 * Checks that {@code lookup} returns every value stored under a key, both on the raw pair RDD
 * and on its grouped form.
 */
@SuppressWarnings("unchecked")
@Test
public void lookup() {
  JavaPairRDD<String, String> categories = sc.parallelizePairs(Arrays.asList(
      new Tuple2<>("Apples", "Fruit"),
      new Tuple2<>("Oranges", "Fruit"),
      new Tuple2<>("Oranges", "Citrus")
  ));

  // Ungrouped: one list element per matching pair.
  int directMatches = categories.lookup("Oranges").size();
  assertEquals(2, directMatches);

  // Grouped: a single Iterable holding both values.
  Iterable<String> groupedValues = categories.groupByKey().lookup("Oranges").get(0);
  assertEquals(2, Iterables.size(groupedValues));
}
代码示例来源:origin: org.apache.spark/spark-core
/**
 * Verifies {@code lookup} on a pair RDD with a duplicated key, before and after
 * {@code groupByKey()}.
 */
@SuppressWarnings("unchecked")
@Test
public void lookup() {
  JavaPairRDD<String, String> categories = sc.parallelizePairs(Arrays.asList(
      new Tuple2<>("Apples", "Fruit"),
      new Tuple2<>("Oranges", "Fruit"),
      new Tuple2<>("Oranges", "Citrus")
  ));

  int rawCount = categories.lookup("Oranges").size();
  assertEquals(2, rawCount);

  // After grouping, the first (only) lookup result is the Iterable of both values.
  Iterable<String> orangeGroup = categories.groupByKey().lookup("Oranges").get(0);
  int groupedCount = Iterables.size(orangeGroup);
  assertEquals(2, groupedCount);
}
代码示例来源:origin: org.apache.spark/spark-core_2.10
/**
 * Exercises {@code lookup("Oranges")} on the plain pair RDD and on the result of
 * {@code groupByKey()}; both are expected to expose two values.
 */
@SuppressWarnings("unchecked")
@Test
public void lookup() {
  JavaPairRDD<String, String> categories = sc.parallelizePairs(Arrays.asList(
      new Tuple2<>("Apples", "Fruit"),
      new Tuple2<>("Oranges", "Fruit"),
      new Tuple2<>("Oranges", "Citrus")
  ));

  final String key = "Oranges";
  assertEquals(2, categories.lookup(key).size());

  Iterable<String> valuesForKey = categories.groupByKey().lookup(key).get(0);
  assertEquals(2, Iterables.size(valuesForKey));
}
代码示例来源:origin: OryxProject/oryx
}).groupByKey();
代码示例来源:origin: OryxProject/oryx
if (model.isImplicit()) {
aggregated = tuples.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN);
} else {
代码示例来源:origin: apache/kylin
new FlatOutputFucntion(cubeName, segmentId, metaUrl, sConf, samplingPercent, bytesWritten));
JavaPairRDD<SelfDefineSortableKey, Iterable<Text>> aggredRDD = flatOutputRDD.groupByKey(
new FactDistinctPartitioner(cubeName, metaUrl, sConf, reducerMapping.getTotalReducerNum()));
代码示例来源:origin: OryxProject/oryx
/**
 * Collapses {@link Rating}s that share the same user/item pair into a single rating.
 * In implicit mode the values are combined with {@code MLFunctions.SUM_WITH_NAN}; otherwise
 * the last value wins. Pairs whose combined value is NaN are dropped.
 *
 * @param original ratings to combine
 * @param epsilon divisor applied to the score before {@code Math.log1p} when
 *     {@code logStrength} is set
 * @return one rating per remaining user/item pair
 */
private JavaRDD<Rating> aggregateScores(JavaRDD<? extends Rating> original, double epsilon) {
  JavaPairRDD<Tuple2<Integer,Integer>,Double> byUserItem = original.mapToPair(
      rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()));

  JavaPairRDD<Tuple2<Integer,Integer>,Double> combined;
  if (!implicit) {
    // For non-implicit, last wins.
    combined = byUserItem.foldByKey(Double.NaN, (current, next) -> next);
  } else {
    // TODO can we avoid groupByKey? reduce, combine, fold don't seem viable since
    // they don't guarantee the delete elements are properly handled
    combined = byUserItem.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN);
  }

  JavaPairRDD<Tuple2<Integer,Integer>,Double> withoutNaN =
      combined.filter(entry -> !Double.isNaN(entry._2()));

  if (!logStrength) {
    return withoutNaN.map(entry -> new Rating(
        entry._1()._1(),
        entry._1()._2(),
        entry._2()));
  }
  return withoutNaN.map(entry -> new Rating(
      entry._1()._1(),
      entry._1()._2(),
      Math.log1p(entry._2() / epsilon)));
}
代码示例来源:origin: OryxProject/oryx
positiveUserProducts.groupByKey().flatMapToPair(
new PairFlatMapFunction<Tuple2<Integer,Iterable<Integer>>,Integer,Integer>() {
private final RandomGenerator random = RandomManager.getRandom();
代码示例来源:origin: mahmoudparsian/data-algorithms-book
/**
 * Computes, for every key, the ranked product (the N-th root of the product of its ranks)
 * together with N, the number of ranks seen for that key.
 *
 * @param context Spark context used to union the per-study rank RDDs
 * @param ranks one (key, rank) RDD per study
 * @return key mapped to (rankedProduct, N)
 */
static JavaPairRDD<String, Tuple2<Double, Integer>> computeRankedProducts(
        JavaSparkContext context,
        JavaPairRDD<String, Long>[] ranks) {
    JavaPairRDD<String, Long> unionRDD = context.union(ranks);

    // next find unique keys, with their associated copa scores
    JavaPairRDD<String, Iterable<Long>> groupedByGeneRDD = unionRDD.groupByKey();

    // next calculate ranked products and the number of elements
    // input: copa scores for all studies
    // output: (rankedProduct, N)
    JavaPairRDD<String, Tuple2<Double, Integer>> rankedProducts =
            groupedByGeneRDD.mapValues((Iterable<Long> values) -> {
                int n = 0;
                // Accumulate in double: a long accumulator silently wraps around once the
                // product of ranks exceeds Long.MAX_VALUE, corrupting the result.
                double product = 1.0;
                for (Long v : values) {
                    product *= v.doubleValue();
                    n++;
                }
                double rankedProduct = Math.pow(product, 1.0 / ((double) n));
                return new Tuple2<Double, Integer>(rankedProduct, n);
            });
    return rankedProducts;
}
代码示例来源:origin: mahmoudparsian/data-algorithms-book
/**
 * Computes, for every key, the ranked product (the N-th root of the product of its ranks)
 * together with N, the number of ranks seen for that key.
 *
 * @param context Spark context used to union the per-study rank RDDs
 * @param ranks one (key, rank) RDD per study
 * @return key mapped to (rankedProduct, N)
 */
static JavaPairRDD<String, Tuple2<Double, Integer>> computeRankedProducts(
        JavaSparkContext context,
        JavaPairRDD<String, Long>[] ranks) {
    JavaPairRDD<String, Long> unionRDD = context.union(ranks);

    // next find unique keys, with their associated copa scores
    JavaPairRDD<String, Iterable<Long>> groupedByGeneRDD = unionRDD.groupByKey();

    // next calculate ranked products and the number of elements
    JavaPairRDD<String, Tuple2<Double, Integer>> rankedProducts = groupedByGeneRDD.mapValues(
            new Function<
                    Iterable<Long>,            // input: copa scores for all studies
                    Tuple2<Double, Integer>    // output: (rankedProduct, N)
                    >() {
                @Override
                public Tuple2<Double, Integer> call(Iterable<Long> values) {
                    int n = 0;
                    // Accumulate in double: a long accumulator silently wraps around once
                    // the product of ranks exceeds Long.MAX_VALUE, corrupting the result.
                    double product = 1.0;
                    for (Long v : values) {
                        product *= v.doubleValue();
                        n++;
                    }
                    double rankedProduct = Math.pow(product, 1.0 / ((double) n));
                    return new Tuple2<Double, Integer>(rankedProduct, n);
                }
            });
    return rankedProducts;
}
代码示例来源:origin: mahmoudparsian/data-algorithms-book
// Gather all Double values that share the same String key into one Iterable per key.
JavaPairRDD<String, Iterable<Double>> groupedByGene = genes.groupByKey();
代码示例来源:origin: mahmoudparsian/data-algorithms-book
// Gather all Double values that share the same String key into one Iterable per key.
JavaPairRDD<String, Iterable<Double>> groupedByGene = genes.groupByKey();
代码示例来源:origin: apache/tinkerpop
/**
 * Runs the reduce stage: groups the map/combine output by key, feeds each partition through a
 * {@code ReduceIterator}, and sorts the result by key when the job defines a reduce key sort.
 *
 * @param mapOrCombineRDD output of the map (or combine) stage
 * @param mapReduce the job, consulted here only for its optional reduce key sort
 * @param graphComputerConfiguration configuration applied inside each partition and used to
 *     re-create the job and open the graph there
 * @return the reduced pairs, sorted into a single partition when a key sort is present
 */
public static <K, V, OK, OV> JavaPairRDD<OK, OV> executeReduce(
        final JavaPairRDD<K, V> mapOrCombineRDD, final MapReduce<K, V, OK, OV, ?> mapReduce,
        final Configuration graphComputerConfiguration) {
    final JavaPairRDD<OK, OV> reduced = mapOrCombineRDD.groupByKey().mapPartitionsToPair(partition -> {
        KryoShimServiceLoader.applyConfiguration(graphComputerConfiguration);
        // The job is re-created from configuration inside the partition rather than
        // captured from the enclosing scope.
        final MapReduce<K, V, OK, OV, ?> partitionJob =
                MapReduce.<MapReduce<K, V, OK, OV, ?>>createMapReduce(
                        HadoopGraph.open(graphComputerConfiguration), graphComputerConfiguration);
        return new ReduceIterator<>(partitionJob, partition);
    });

    if (!mapReduce.getReduceKeySort().isPresent()) {
        return reduced;
    }
    return reduced.sortByKey(mapReduce.getReduceKeySort().get(), true, 1);
}
}
代码示例来源:origin: mahmoudparsian/data-algorithms-book
// Collect every String value that shares the same key into one Iterable per key.
JavaPairRDD<String, Iterable<String>> anagrams = rdd.groupByKey();
代码示例来源:origin: mahmoudparsian/data-algorithms-book
// Pair each vector with an Integer key via getClosest (presumably the index of its nearest centroid -- confirm).
JavaPairRDD<Integer, Vector> closest = getClosest(data, centroids);
// Gather all vectors that share the same key.
JavaPairRDD<Integer, Iterable<Vector>> pointsGroup = closest.groupByKey();
// Derive one new centroid per key from the grouped vectors.
Map<Integer, Vector> newCentroids = getNewCentroids(pointsGroup);
代码示例来源:origin: mahmoudparsian/data-algorithms-book
// Collect all (Integer, String) tuples that share the same String key into one Iterable per key.
JavaPairRDD<String, Iterable<Tuple2<Integer,String>>> groupedByWord = wordAsKey.groupByKey();
内容来源于网络,如有侵权,请联系作者删除!