Usage of the org.apache.spark.api.java.JavaPairRDD.keys() method, with code examples


This article collects code examples for the Java method org.apache.spark.api.java.JavaPairRDD.keys(), showing how JavaPairRDD.keys() is used in practice. The examples are drawn from selected open-source projects on platforms such as GitHub, Stack Overflow, and Maven, and should serve as useful references. Details of the JavaPairRDD.keys() method:
Package: org.apache.spark.api.java
Class: JavaPairRDD
Method: keys

JavaPairRDD.keys overview

JavaPairRDD.keys() returns a JavaRDD<K> containing the key of each key-value pair in this RDD, discarding the values. Like other RDD transformations, it is evaluated lazily.
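
Before the project excerpts, here is a minimal, self-contained sketch of the call. The class name, local SparkContext setup, and sample data below are illustrative only and are not taken from any of the projects cited later.

import java.util.Arrays;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class KeysDemo {
 public static void main(String[] args) {
  // Local Spark context purely for demonstration.
  JavaSparkContext sc = new JavaSparkContext("local[*]", "keys-demo");
  JavaPairRDD<String, Integer> pairs = sc.parallelizePairs(
    Arrays.asList(new Tuple2<>("a", 1), new Tuple2<>("b", 2)));
  // keys() drops the values and yields a JavaRDD of the keys only.
  JavaRDD<String> keys = pairs.keys();
  System.out.println(keys.collect()); // prints [a, b]
  sc.stop();
 }
}

Note that keys() does not deduplicate: if the pair RDD contains the same key several times, the resulting RDD will too, which is why several excerpts below chain .distinct() after it.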

Code Examples

Code example source: OryxProject/oryx

private static void addIDsExtension(PMML pmml,
                  String key,
                  JavaPairRDD<Integer,?> features,
                  Map<Integer,String> indexToID) {
 // Collect the keys (feature indices) and translate each index to its string ID.
 List<String> ids = features.keys().collect().stream().map(indexToID::get).collect(Collectors.toList());
 AppPMMLUtils.addExtensionContent(pmml, key, ids);
}

Code example source: databricks/learning-spark

public static final JavaRDD<String> filterIPAddress(
  JavaPairRDD<String, Long> ipAddressCount) {
 // Keep only the addresses whose count exceeds 10, then drop the counts.
 return ipAddressCount
  .filter(new IpCountGreaterThan10())
  .keys();
}

Code example source: OryxProject/oryx

/**
 * Computes root mean squared error of {@link Rating#rating()} versus predicted value.
 */
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) {
 // Key each test rating by its (user, product) pair.
 JavaPairRDD<Tuple2<Integer,Integer>,Double> testUserProductValues =
   testData.mapToPair(rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()));
 @SuppressWarnings("unchecked")
 RDD<Tuple2<Object,Object>> testUserProducts =
   (RDD<Tuple2<Object,Object>>) (RDD<?>) testUserProductValues.keys().rdd();
 JavaRDD<Rating> predictions = testData.wrapRDD(mfModel.predict(testUserProducts));
 // Join predicted and actual ratings on (user, product) and average the squared error.
 double mse = predictions.mapToPair(
   rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating())
 ).join(testUserProductValues).values().mapToDouble(valuePrediction -> {
  double diff = valuePrediction._1() - valuePrediction._2();
  return diff * diff;
 }).mean();
 return Math.sqrt(mse);
}

Code example source: SeldonIO/seldon-server

List<String> clientList = pairs.keys().distinct().collect();
Queue<ClientDetail> clientDetailQueue = new PriorityQueue<ClientDetail>(30, new Comparator<ClientDetail>() {
 // ... comparator body omitted in the original excerpt
});

Code example source: com.davidbracewell/mango

@Override
public MStream<T> keys() {
 return new SparkStream<>(rdd.keys());
}

Code example source: uber/hudi

@Override
 public Pair<Optional<JavaRDD<GenericRecord>>, String> fetchNewData(
   Optional<String> lastCheckpointStr, long sourceLimit) {
  try {
   // find the source commit to pull
   Optional<String> commitToPull = findCommitToPull(lastCheckpointStr);

   if (!commitToPull.isPresent()) {
    return new ImmutablePair<>(Optional.empty(),
        lastCheckpointStr.orElse(""));
   }

   // read the files out.
   List<FileStatus> commitDeltaFiles = Arrays.asList(
     fs.listStatus(new Path(incrPullRootPath, commitToPull.get())));
   String pathStr = commitDeltaFiles.stream().map(f -> f.getPath().toString())
     .collect(Collectors.joining(","));
   JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr,
     AvroKeyInputFormat.class, AvroKey.class, NullWritable.class,
     sparkContext.hadoopConfiguration());
   return new ImmutablePair<>(Optional.of(avroRDD.keys().map(r -> ((GenericRecord) r.datum()))),
     String.valueOf(commitToPull.get()));
  } catch (IOException ioe) {
   throw new HoodieIOException(
     "Unable to read from source from checkpoint: " + lastCheckpointStr, ioe);
  }
 }
}

Code example source: com.uber.hoodie/hoodie-utilities

@Override
 public Pair<Optional<JavaRDD<GenericRecord>>, String> fetchNewData(
   Optional<String> lastCheckpointStr, long sourceLimit) {
  try {
   // find the source commit to pull
   Optional<String> commitToPull = findCommitToPull(lastCheckpointStr);

   if (!commitToPull.isPresent()) {
    return new ImmutablePair<>(Optional.empty(),
      lastCheckpointStr.isPresent() ? lastCheckpointStr.get() : "");
   }

   // read the files out.
   List<FileStatus> commitDeltaFiles = Arrays.asList(
     fs.listStatus(new Path(incrPullRootPath, commitToPull.get())));
   String pathStr = commitDeltaFiles.stream().map(f -> f.getPath().toString())
     .collect(Collectors.joining(","));
   JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr,
     AvroKeyInputFormat.class, AvroKey.class, NullWritable.class,
     sparkContext.hadoopConfiguration());
   return new ImmutablePair<>(Optional.of(avroRDD.keys().map(r -> ((GenericRecord) r.datum()))),
     String.valueOf(commitToPull.get()));
  } catch (IOException ioe) {
   throw new HoodieIOException(
     "Unable to read from source from checkpoint: " + lastCheckpointStr, ioe);
  }
 }
}

Code example source: com.uber.hoodie/hoodie-utilities

@Override
 protected JavaRDD<GenericRecord> fromFiles(AvroConvertor convertor, String pathStr) {
  JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr,
    AvroKeyInputFormat.class, AvroKey.class, NullWritable.class,
    sparkContext.hadoopConfiguration());
  return avroRDD.keys().map(r -> ((GenericRecord) r.datum()));
 }
}

Code example source: uber/hudi

@Override
 protected JavaRDD<GenericRecord> fromFiles(AvroConvertor convertor, String pathStr) {
  JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr,
    AvroKeyInputFormat.class, AvroKey.class, NullWritable.class,
    sparkContext.hadoopConfiguration());
  return avroRDD.keys().map(r -> ((GenericRecord) r.datum()));
 }
}

Code example source: org.apache.pig/pig

private JavaRDD<Tuple2<IndexedKey,Tuple>> handleSecondarySort(
    RDD<Tuple> rdd, POGlobalRearrangeSpark op, int parallelism) {
  RDD<Tuple2<Tuple, Object>> rddPair = rdd.map(new ToKeyNullValueFunction(),
      SparkUtil.<Tuple, Object>getTuple2Manifest());
  JavaPairRDD<Tuple, Object> pairRDD = new JavaPairRDD<Tuple, Object>(rddPair,
      SparkUtil.getManifest(Tuple.class),
      SparkUtil.getManifest(Object.class));
    // first sort the tuples by secondary key when useSecondaryKey sort is enabled
  JavaPairRDD<Tuple, Object> sorted = pairRDD.repartitionAndSortWithinPartitions(
      new HashPartitioner(parallelism),
      new PigSecondaryKeyComparatorSpark(op.getSecondarySortOrder()));
  JavaRDD<Tuple> jrdd = sorted.keys();
  JavaRDD<Tuple2<IndexedKey,Tuple>> jrddPair = jrdd.map(new ToKeyValueFunction(op));
  return jrddPair;
}

Code example source: org.apache.pig/pig

private JavaRDD<Tuple2<IndexedKey, Tuple>> handleSecondarySort(
    RDD<Tuple> rdd, POReduceBySpark op, int parallelism) {
  RDD<Tuple2<Tuple, Object>> rddPair = rdd.map(new ToKeyNullValueFunction(),
      SparkUtil.<Tuple, Object>getTuple2Manifest());
  JavaPairRDD<Tuple, Object> pairRDD = new JavaPairRDD<Tuple, Object>(rddPair,
      SparkUtil.getManifest(Tuple.class),
      SparkUtil.getManifest(Object.class));
    // first sort the tuples by secondary key when useSecondaryKey sort is enabled
  JavaPairRDD<Tuple, Object> sorted = pairRDD.repartitionAndSortWithinPartitions(
      new HashPartitioner(parallelism),
      new PigSecondaryKeyComparatorSpark(op.getSecondarySortOrder()));
  JavaRDD<Tuple> jrdd = sorted.keys();
  JavaRDD<Tuple2<IndexedKey, Tuple>> jrddPair = jrdd.map(new ToKeyValueFunction(op));
  return jrddPair;
}

Code example source: locationtech/geowave

final JavaFutureAction<List<Byte>> leftFuture =
  leftIndex.setName("LeftIndex").keys().map(t -> t.getBytes()[0]).distinct(4).collectAsync();
final JavaFutureAction<List<Byte>> rightFuture =
  rightIndex.setName("RightIndex").keys().map(t -> t.getBytes()[0]).distinct(4).collectAsync();
