本文整理了Java中org.apache.spark.api.java.JavaPairRDD.repartitionAndSortWithinPartitions()
方法的一些代码示例,展示了JavaPairRDD.repartitionAndSortWithinPartitions()
的具体用法。这些代码示例主要来源于Github
/Stackoverflow
/Maven
等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。JavaPairRDD.repartitionAndSortWithinPartitions()
方法的具体详情如下:
包路径:org.apache.spark.api.java.JavaPairRDD
类名称:JavaPairRDD
方法名:repartitionAndSortWithinPartitions
暂无
代码示例来源:origin: apache/drill
@Override
public JavaPairRDD<HiveKey, BytesWritable> shuffle(
JavaPairRDD<HiveKey, BytesWritable> input, int numPartitions) {
// Clamp non-positive partition counts to 1. The original only guarded
// against negatives, so numPartitions == 0 slipped through and built a
// HashPartitioner with zero partitions — Spark's HashPartitioner computes
// key.hashCode % numPartitions, which throws ArithmeticException for 0.
if (numPartitions <= 0) {
numPartitions = 1;
}
// Hash-partition by HiveKey and sort each partition by key in a single shuffle.
return input.repartitionAndSortWithinPartitions(new HashPartitioner(numPartitions));
}
代码示例来源:origin: org.apache.spark/spark-core_2.10
rdd.repartitionAndSortWithinPartitions(partitioner);
assertTrue(repartitioned.partitioner().isPresent());
assertEquals(repartitioned.partitioner().get(), partitioner);
代码示例来源:origin: apache/kylin
hfilerdd.repartitionAndSortWithinPartitions(new HFilePartitioner(keys),
RowKeyWritable.RowKeyComparator.INSTANCE)
.mapToPair(new PairFunction<Tuple2<RowKeyWritable, KeyValue>, ImmutableBytesWritable, KeyValue>() {
代码示例来源:origin: org.apache.spark/spark-core
rdd.repartitionAndSortWithinPartitions(partitioner);
assertTrue(repartitioned.partitioner().isPresent());
assertEquals(repartitioned.partitioner().get(), partitioner);
代码示例来源:origin: org.apache.spark/spark-core_2.11
rdd.repartitionAndSortWithinPartitions(partitioner);
assertTrue(repartitioned.partitioner().isPresent());
assertEquals(repartitioned.partitioner().get(), partitioner);
代码示例来源:origin: apache/drill
@Override
public JavaPairRDD<HiveKey, BytesWritable> shuffle(
JavaPairRDD<HiveKey, BytesWritable> input, int numPartitions) {
JavaPairRDD<HiveKey, BytesWritable> rdd;
if (totalOrder) {
// Total order requested: use a global sortByKey (range-partitioned sort)
// instead of a per-partition sort.
if (numPartitions > 0) {
// NOTE(review): persisting to disk before sortByKey presumably avoids
// recomputing the input when sortByKey runs its sampling pass to build
// range bounds — confirm against SparkPlan's caching contract.
if (numPartitions > 1 && input.getStorageLevel() == StorageLevel.NONE()) {
input.persist(StorageLevel.DISK_ONLY());
// Register the cached RDD id so the plan can unpersist it later.
sparkPlan.addCachedRDDId(input.id());
}
rdd = input.sortByKey(true, numPartitions);
} else {
// Non-positive partition count: let Spark pick the partitioning.
rdd = input.sortByKey(true);
}
} else {
// No total order needed: hash-partition and sort within each partition
// in one shuffle — cheaper than a global sort.
Partitioner partitioner = new HashPartitioner(numPartitions);
rdd = input.repartitionAndSortWithinPartitions(partitioner);
}
return rdd;
}
代码示例来源:origin: apache/hive
@Override
public JavaPairRDD<HiveKey, BytesWritable> shuffle(
JavaPairRDD<HiveKey, BytesWritable> input, int numPartitions) {
JavaPairRDD<HiveKey, BytesWritable> rdd;
if (totalOrder) {
// Total order requested: use a global sortByKey (range-partitioned sort).
if (numPartitions > 0) {
// NOTE(review): persisting to disk before sortByKey presumably avoids
// recomputing the input when sortByKey samples keys to build range
// bounds — confirm against SparkPlan's caching contract.
if (numPartitions > 1 && input.getStorageLevel() == StorageLevel.NONE()) {
input.persist(StorageLevel.DISK_ONLY());
// Register the cached RDD id so the plan can unpersist it later.
sparkPlan.addCachedRDDId(input.id());
}
rdd = input.sortByKey(true, numPartitions);
} else {
// Non-positive partition count: let Spark pick the partitioning.
rdd = input.sortByKey(true);
}
} else {
// No total order needed: hash-partition and sort within each partition
// in a single shuffle.
Partitioner partitioner = new HashPartitioner(numPartitions);
rdd = input.repartitionAndSortWithinPartitions(partitioner);
}
// If a custom shuffle serializer is configured, install it on the
// underlying ShuffledRDD. Raw-typed cast: ShuffledRDD's type parameters
// are not recoverable here, so the unchecked access is deliberate.
if (shuffleSerializer != null) {
if (rdd.rdd() instanceof ShuffledRDD) {
((ShuffledRDD) rdd.rdd()).setSerializer(shuffleSerializer);
}
}
return rdd;
}
代码示例来源:origin: mahmoudparsian/data-algorithms-book
JavaPairRDD<Integer, Integer> repartitioned = rdd.repartitionAndSortWithinPartitions(partitioner);
List<List<Tuple2<Integer, Integer>>> partitions = repartitioned.glom().collect();
代码示例来源:origin: mahmoudparsian/data-algorithms-book
= valueToKey.repartitionAndSortWithinPartitions(new CustomPartitioner(partitions), TupleComparatorDescending.INSTANCE);
代码示例来源:origin: uk.gov.gchq.gaffer/spark-accumulo-library
@Override
protected void prepareKeyValues(final ImportKeyValueJavaPairRDDToAccumulo operation, final AccumuloKeyRangePartitioner partitioner) throws OperationException {
    // Shuffle the input pairs into Accumulo key ranges, sorting each
    // partition by key, then write the result out as Accumulo RFiles.
    operation.getInput()
            .repartitionAndSortWithinPartitions(partitioner)
            .saveAsNewAPIHadoopFile(
                    operation.getOutputPath(),
                    Key.class,
                    Value.class,
                    AccumuloFileOutputFormat.class,
                    getConfiguration(operation));
}
代码示例来源:origin: org.apache.pig/pig
public static RDD<Tuple> handleSecondarySort(
RDD<Tuple2<IndexedKey, Tuple>> rdd, POPackage pkgOp) {
    // View the Scala RDD as a JavaPairRDD keyed by IndexedKey.
    final JavaPairRDD<IndexedKey, Tuple> keyed = JavaPairRDD.fromRDD(
            rdd,
            SparkUtil.getManifest(IndexedKey.class),
            SparkUtil.getManifest(Tuple.class));
    // Preserve the current parallelism while routing tuples that share an
    // IndexedKey to the same partition, sorted within each partition.
    final int numPartitions = keyed.partitions().size();
    final JavaPairRDD<IndexedKey, Tuple> shuffled =
            keyed.repartitionAndSortWithinPartitions(new IndexedKeyPartitioner(numPartitions));
    // Fold each key's sorted values into (key, (val1, val2, val3, ...));
    // preservesPartitioning = true because the keys are left untouched.
    return shuffled.mapPartitions(new AccumulateByKey(pkgOp), true).rdd();
}
代码示例来源:origin: com.facebook.presto.hive/hive-apache
@Override
public JavaPairRDD<HiveKey, Iterable<BytesWritable>> shuffle(
JavaPairRDD<HiveKey, BytesWritable> input, int numPartitions) {
    // Total order: global range-partitioned sort (explicit partition count
    // when positive, Spark's default otherwise). Otherwise a single shuffle
    // that hash-partitions and sorts within each partition.
    final JavaPairRDD<HiveKey, BytesWritable> sorted = totalOrder
            ? (numPartitions > 0
                    ? input.sortByKey(true, numPartitions)
                    : input.sortByKey(true))
            : input.repartitionAndSortWithinPartitions(new HashPartitioner(numPartitions));
    // Group the sorted stream into (key, values) pairs per partition.
    return sorted.mapPartitionsToPair(new ShuffleFunction());
}
代码示例来源:origin: org.apache.kylin/kylin-storage-hbase
hfilerdd.repartitionAndSortWithinPartitions(new HFilePartitioner(keys),
RowKeyWritable.RowKeyComparator.INSTANCE)
.mapToPair(new PairFunction<Tuple2<RowKeyWritable, KeyValue>, ImmutableBytesWritable, KeyValue>() {
代码示例来源:origin: org.apache.pig/pig
private JavaRDD<Tuple2<IndexedKey,Tuple>> handleSecondarySort(
RDD<Tuple> rdd, POGlobalRearrangeSpark op, int parallelism) {
    // Pair every tuple with a null value so the pair-RDD shuffle machinery
    // can sort by the tuple itself.
    RDD<Tuple2<Tuple, Object>> keyedRdd = rdd.map(new ToKeyNullValueFunction(),
            SparkUtil.<Tuple, Object>getTuple2Manifest());
    JavaPairRDD<Tuple, Object> pairs = new JavaPairRDD<Tuple, Object>(keyedRdd,
            SparkUtil.getManifest(Tuple.class),
            SparkUtil.getManifest(Object.class));
    // Shuffle into `parallelism` hash partitions, ordering each partition
    // with the operator's configured secondary-sort comparator.
    JavaPairRDD<Tuple, Object> ordered = pairs.repartitionAndSortWithinPartitions(
            new HashPartitioner(parallelism),
            new PigSecondaryKeyComparatorSpark(op.getSecondarySortOrder()));
    // Discard the placeholder values and re-key as (IndexedKey, Tuple).
    return ordered.keys().map(new ToKeyValueFunction(op));
}
代码示例来源:origin: org.apache.pig/pig
private JavaRDD<Tuple2<IndexedKey, Tuple>> handleSecondarySort(
RDD<Tuple> rdd, POReduceBySpark op, int parallelism) {
    // Pair every tuple with a null value so the pair-RDD shuffle machinery
    // can sort by the tuple itself.
    RDD<Tuple2<Tuple, Object>> keyedRdd = rdd.map(new ToKeyNullValueFunction(),
            SparkUtil.<Tuple, Object>getTuple2Manifest());
    JavaPairRDD<Tuple, Object> pairs = new JavaPairRDD<Tuple, Object>(keyedRdd,
            SparkUtil.getManifest(Tuple.class),
            SparkUtil.getManifest(Object.class));
    // Shuffle into `parallelism` hash partitions, ordering each partition
    // with the operator's configured secondary-sort comparator.
    JavaPairRDD<Tuple, Object> ordered = pairs.repartitionAndSortWithinPartitions(
            new HashPartitioner(parallelism),
            new PigSecondaryKeyComparatorSpark(op.getSecondarySortOrder()));
    // Discard the placeholder values and re-key as (IndexedKey, Tuple).
    return ordered.keys().map(new ToKeyValueFunction(op));
}
代码示例来源:origin: seznam/euphoria
.union(rightPair)
.setName(operator.getName() + "::union-inputs")
.repartitionAndSortWithinPartitions(partitioner, comparator)
.setName(operator.getName() + "::sort-by-key-and-side")
.mapPartitions(
代码示例来源:origin: seznam/euphoria
.repartitionAndSortWithinPartitions(groupingPartitioner, comparator)
.setName(operator.getName() + "::sort");
代码示例来源:origin: seznam/euphoria
.mapToPair(t -> new Tuple2<>(new KeyedWindowValue<>(t._1, t._2), Empty.get()))
.setName(operator.getName() + "::create-composite-key")
.repartitionAndSortWithinPartitions(
partitioner, new SecondarySortComparator<>(operator.getValueComparator()))
.setName(operator.getName() + "::secondary-sort")
.repartitionAndSortWithinPartitions(partitioner)
.setName(operator.getName() + "::sort")
.mapPartitionsToPair(ReduceByKeyIterator::new)
内容来源于网络,如有侵权,请联系作者删除!