org.apache.spark.api.java.JavaPairRDD.repartition()方法的使用及代码示例

x33g5p2x  于2022-01-21 转载在 其他  
字(6.1k)|赞(0)|评价(0)|浏览(90)

本文整理了Java中org.apache.spark.api.java.JavaPairRDD.repartition()方法的一些代码示例,展示了JavaPairRDD.repartition()的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。JavaPairRDD.repartition()方法的具体详情如下:
包路径:org.apache.spark.api.java.JavaPairRDD
类名称:JavaPairRDD
方法名:repartition

JavaPairRDD.repartition介绍

暂无

代码示例

代码示例来源:origin: apache/tinkerpop

loadedGraphRDD = loadedGraphRDD.coalesce(this.workers);
else if (loadedGraphRDD.partitions().size() < this.workers) // ensures that the loaded graphRDD does not have less partitions than workers
  loadedGraphRDD = loadedGraphRDD.repartition(this.workers);

代码示例来源:origin: com.davidbracewell/mango

/**
 * Redistributes the stream's data across the given number of partitions.
 *
 * <p>Delegates to Spark's {@code JavaPairRDD.repartition}, which performs a
 * full shuffle, and wraps the result in a new stream; the receiver is not
 * modified.
 *
 * @param partitions the desired number of partitions
 * @return a new {@code SparkPairStream} backed by the repartitioned RDD
 */
@Override
public MPairStream<T, U> repartition(int partitions) {
 return new SparkPairStream<>(rdd.repartition(partitions));
}

代码示例来源:origin: org.rcsb/mmtf-spark

/**
 * Get a half cartesian - using the first part of the input RDD as a string key.
 * Pairs every element with every other element, then keeps only one ordering of
 * each pair (the one whose first key compares greater), so no comparison is
 * repeated and no element is paired with itself.
 * @param <K> the type of the key (must be comparable so duplicate orderings can be filtered)
 * @param <V> the type of the value
 * @param inputRDD the input rdd - keys are used to order and de-duplicate the pairs
 * @param numPartitions the number of partitions to repartition to; 0 means keep the existing partitioning
 * @return the RDD of all non-repeated comparisons
 */
public static <K extends Comparable<K>,V> JavaPairRDD<Tuple2<K, V>, Tuple2<K, V>> getHalfCartesian(JavaPairRDD<K, V> inputRDD, int numPartitions) {
  JavaPairRDD<Tuple2<K, V>, Tuple2<K, V>> filteredRDD = inputRDD
      .cartesian(inputRDD)
      .filter(t -> t._1._1.compareTo(t._2._1)>0);
  // Repartition only when a partition count was explicitly requested
  if (numPartitions!=0) {
    return filteredRDD.repartition(numPartitions);
  }
  // Otherwise return it with its current partitioning unchanged
  return filteredRDD;
}

代码示例来源:origin: uber/hudi

/**
 * Deletes the files selected by the cleaner policy under the given partition
 * paths and reports a {@link HoodieCleanStat} for each requested partition.
 *
 * <p>Work is distributed over Spark: file-deletion candidates are computed per
 * partition, the pairs are repartitioned to avoid skew, deletions are executed
 * per Spark partition, and per-Hudi-partition stats are merged by key.
 *
 * @param partitionsToClean partition paths to clean
 * @param jsc Spark context used to parallelize the work
 * @return one clean stat per entry of {@code partitionsToClean}, in the same
 *         order; partitions with nothing deleted get an empty stat
 */
private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean,
  JavaSparkContext jsc) {
 // Cap parallelism at the number of partitions so no task is empty by construction.
 int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
 logger.info("Using cleanerParallelism: " + cleanerParallelism);
 List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc
   .parallelize(partitionsToClean, cleanerParallelism)
   .flatMapToPair(getFilesToDeleteFunc(this, config))
   .repartition(cleanerParallelism)                    // repartition to remove skews
   .mapPartitionsToPair(deleteFilesFunc(this)).reduceByKey(
     // merge partition level clean stats below
     (Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1
       .merge(e2)).collect();
 // Index the collected stats by partition path for the lookup below.
 Map<String, PartitionCleanStat> partitionCleanStatsMap = partitionCleanStats.stream()
   .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2));
 HoodieCleanHelper cleaner = new HoodieCleanHelper(this, config);
 // Return PartitionCleanStat for each partition passed.
 return partitionsToClean.stream().map(partitionPath -> {
  // Partitions with no deletions produced no stat above; fall back to an empty one.
  PartitionCleanStat partitionCleanStat =
    (partitionCleanStatsMap.containsKey(partitionPath)) ? partitionCleanStatsMap
      .get(partitionPath) : new PartitionCleanStat(partitionPath);
  return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy())
    .withPartitionPath(partitionPath)
    .withEarliestCommitRetained(cleaner.getEarliestCommitToRetain())
    .withDeletePathPattern(partitionCleanStat.deletePathPatterns)
    .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles)
    .withFailedDeletes(partitionCleanStat.failedDeleteFiles).build();
 }).collect(Collectors.toList());
}

代码示例来源:origin: com.uber.hoodie/hoodie-client

/**
 * Deletes the files selected by the cleaner policy under the given partition
 * paths and reports a {@link HoodieCleanStat} for each requested partition.
 *
 * <p>Work is distributed over Spark: file-deletion candidates are computed per
 * partition, the pairs are repartitioned to avoid skew, deletions are executed
 * per Spark partition, and per-Hudi-partition stats are merged by key.
 *
 * @param partitionsToClean partition paths to clean
 * @param jsc Spark context used to parallelize the work
 * @return one clean stat per entry of {@code partitionsToClean}, in the same
 *         order; partitions with nothing deleted get an empty stat
 */
private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean,
  JavaSparkContext jsc) {
 // Cap parallelism at the number of partitions so no task is empty by construction.
 int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
 logger.info("Using cleanerParallelism: " + cleanerParallelism);
 List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc
   .parallelize(partitionsToClean, cleanerParallelism)
   .flatMapToPair(getFilesToDeleteFunc(this, config))
   .repartition(cleanerParallelism)                    // repartition to remove skews
   .mapPartitionsToPair(deleteFilesFunc(this)).reduceByKey(
     // merge partition level clean stats below
     (Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1
       .merge(e2)).collect();
 // Index the collected stats by partition path; method references are clearer
 // than the equivalent `e -> e._1()` lambdas.
 Map<String, PartitionCleanStat> partitionCleanStatsMap = partitionCleanStats.stream()
   .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2));
 HoodieCleanHelper cleaner = new HoodieCleanHelper(this, config);
 // Return PartitionCleanStat for each partition passed.
 return partitionsToClean.stream().map(partitionPath -> {
  // Partitions with no deletions produced no stat above; fall back to an empty one.
  PartitionCleanStat partitionCleanStat =
    (partitionCleanStatsMap.containsKey(partitionPath)) ? partitionCleanStatsMap
      .get(partitionPath) : new PartitionCleanStat(partitionPath);
  return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy())
    .withPartitionPath(partitionPath)
    .withEarliestCommitRetained(cleaner.getEarliestCommitToRetain())
    .withDeletePathPattern(partitionCleanStat.deletePathPatterns)
    .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles)
    .withFailedDeletes(partitionCleanStat.failedDeleteFiles).build();
 }).collect(Collectors.toList());
}

代码示例来源:origin: org.apache.tinkerpop/spark-gremlin

loadedGraphRDD = loadedGraphRDD.coalesce(this.workers);
else if (loadedGraphRDD.partitions().size() < this.workers) // ensures that the loaded graphRDD does not have less partitions than workers
  loadedGraphRDD = loadedGraphRDD.repartition(this.workers);

代码示例来源:origin: ai.grakn/grakn-kb

loadedGraphRDD = loadedGraphRDD.repartition(this.workers);

代码示例来源:origin: cloudera-labs/envelope

/**
 * Builds a pair RDD of {@code numPartitions} dummy rows, each row keyed by
 * itself, shuffled into {@code numPartitions} partitions.
 *
 * <p>Generates the longs {@code 0..numPartitions-1}, converts each to a
 * {@code Row}, and keys every row with itself before repartitioning.
 *
 * @param numPartitions number of dummy rows to generate and the partition count
 * @return a pair RDD where key and value are the same row
 */
private JavaPairRDD<Row, Row> getDummyRDD(int numPartitions) {
 return Contexts.getSparkSession().range(numPartitions).javaRDD()
   .map(new LongToRowFunction()).keyBy(new ItselfFunction<Row>()).repartition(numPartitions);
}

代码示例来源:origin: org.apache.beam/beam-runners-spark

/**
 * An implementation of {@link Reshuffle} for the Spark runner.
 *
 * <p>Encodes each windowed element to bytes with the supplied coders so it can
 * be transferred over the network, performs a full shuffle via
 * {@code repartition} (preserving the input's partition count), then decodes
 * back to the original windowed representation.
 *
 * @param rdd the RDD to redistribute
 * @param keyCoder coder used to serialize keys for the shuffle
 * @param wvCoder coder used to serialize windowed values for the shuffle
 * @return an RDD with the same elements, redistributed across partitions
 */
 public static <K, V> JavaRDD<WindowedValue<KV<K, V>>> reshuffle(
   JavaRDD<WindowedValue<KV<K, V>>> rdd, Coder<K> keyCoder, WindowedValueCoder<V> wvCoder) {

  // Use coders to convert objects in the PCollection to byte arrays, so they
  // can be transferred over the network for the shuffle.
  return rdd.map(new ReifyTimestampsAndWindowsFunction<>())
    .map(WindowingHelpers.unwindowFunction())
    .mapToPair(TranslationUtils.toPairFunction())
    .mapToPair(CoderHelpers.toByteFunction(keyCoder, wvCoder))
    // The shuffle itself; keeps the same number of partitions as the input.
    .repartition(rdd.getNumPartitions())
    // Decode back from bytes to the original key/windowed-value structure.
    .mapToPair(CoderHelpers.fromByteFunction(keyCoder, wvCoder))
    .map(TranslationUtils.fromPairFunction())
    .map(TranslationUtils.toKVByWindowInValue());
 }
}

相关文章

微信公众号

最新文章

更多

JavaPairRDD类方法