本文整理了Java中org.apache.spark.api.java.JavaPairRDD.keys()
方法的一些代码示例,展示了JavaPairRDD.keys()
的具体用法。这些代码示例主要来源于Github
/Stackoverflow
/Maven
等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。JavaPairRDD.keys()
方法的具体详情如下:
包路径:org.apache.spark.api.java.JavaPairRDD
类名称:JavaPairRDD
方法名:keys
暂无
代码示例来源:origin: OryxProject/oryx
/**
 * Records the string IDs corresponding to the feature matrix's row indices
 * as an extension on the PMML document.
 *
 * @param pmml      PMML document to annotate
 * @param key       extension key under which the ID list is stored
 * @param features  pair RDD whose keys are the integer row indices
 * @param indexToID mapping from row index to external string ID
 */
private static void addIDsExtension(PMML pmml,
                                    String key,
                                    JavaPairRDD<Integer,?> features,
                                    Map<Integer,String> indexToID) {
  // Collect indices on the driver first, then translate each to its ID.
  List<Integer> indices = features.keys().collect();
  List<String> ids = indices.stream()
      .map(indexToID::get)
      .collect(Collectors.toList());
  AppPMMLUtils.addExtensionContent(pmml, key, ids);
}
代码示例来源:origin: databricks/learning-spark
/**
 * Keeps only the IP addresses whose count passes the {@code IpCountGreaterThan10}
 * predicate and returns them as a plain RDD of address strings (counts dropped).
 *
 * @param ipAddressCount pair RDD of (IP address, request count)
 * @return RDD containing just the qualifying IP addresses
 */
public static final JavaRDD<String> filterIPAddress(
    JavaPairRDD<String, Long> ipAddressCount) {
  JavaPairRDD<String, Long> frequent = ipAddressCount.filter(new IpCountGreaterThan10());
  return frequent.keys();
}
代码示例来源:origin: OryxProject/oryx
/**
 * Computes root mean squared error of {@link Rating#rating()} versus predicted value.
 *
 * @param mfModel  trained matrix factorization model used to produce predictions
 * @param testData held-out ratings whose actual values are compared to predictions
 * @return square root of the mean squared (actual - predicted) difference
 */
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) {
  // Key each actual rating by its (user, product) pair for the join below.
  JavaPairRDD<Tuple2<Integer,Integer>,Double> testUserProductValues =
      testData.mapToPair(rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()));
  // The Scala-side predict() wants RDD<Tuple2<Object,Object>>; the double cast
  // through the wildcard silences the compiler (safe: boxed Integers are Objects).
  @SuppressWarnings("unchecked")
  RDD<Tuple2<Object,Object>> testUserProducts =
      (RDD<Tuple2<Object,Object>>) (RDD<?>) testUserProductValues.keys().rdd();
  JavaRDD<Rating> predictions = testData.wrapRDD(mfModel.predict(testUserProducts));
  // Join predictions with actuals on the (user, product) key, square each
  // difference, and average; order of _1/_2 is irrelevant after squaring.
  double mse = predictions.mapToPair(
      rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating())
  ).join(testUserProductValues).values().mapToDouble(valuePrediction -> {
    double diff = valuePrediction._1() - valuePrediction._2();
    return diff * diff;
  }).mean();
  return Math.sqrt(mse);
}
代码示例来源:origin: SeldonIO/seldon-server
List<String> clientList = pairs.keys().distinct().collect();
Queue<ClientDetail> clientDetailQueue = new PriorityQueue<ClientDetail>(30, new Comparator<ClientDetail>() {
代码示例来源:origin: com.davidbracewell/mango
/**
 * Returns the keys of the underlying pair RDD as an {@code MStream}.
 * Delegates to Spark's {@code JavaPairRDD.keys()} and wraps the result
 * in a {@code SparkStream}.
 */
@Override
public MStream<T> keys() {
  return new SparkStream<>(rdd.keys());
}
代码示例来源:origin: uber/hudi
/**
 * Pulls the records added by the next unseen source commit, if any.
 *
 * @param lastCheckpointStr checkpoint produced by the previous pull, if any
 * @param sourceLimit       read budget (not consulted by this implementation)
 * @return pair of (records read, new checkpoint string); empty records when
 *         no commit newer than the checkpoint exists
 */
@Override
public Pair<Optional<JavaRDD<GenericRecord>>, String> fetchNewData(
    Optional<String> lastCheckpointStr, long sourceLimit) {
  try {
    // Determine which source commit to pull next, relative to the checkpoint.
    Optional<String> commitToPull = findCommitToPull(lastCheckpointStr);
    if (!commitToPull.isPresent()) {
      // Nothing new: echo the previous checkpoint (or empty string) back.
      return new ImmutablePair<>(Optional.empty(), lastCheckpointStr.orElse(""));
    }
    // Comma-join the paths of every delta file under the commit directory.
    FileStatus[] deltaFiles = fs.listStatus(new Path(incrPullRootPath, commitToPull.get()));
    String pathStr = Arrays.stream(deltaFiles)
        .map(status -> status.getPath().toString())
        .collect(Collectors.joining(","));
    // Read the Avro files; the Hadoop RDD's keys carry the records.
    JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr,
        AvroKeyInputFormat.class, AvroKey.class, NullWritable.class,
        sparkContext.hadoopConfiguration());
    JavaRDD<GenericRecord> records = avroRDD.keys().map(key -> ((GenericRecord) key.datum()));
    return new ImmutablePair<>(Optional.of(records), String.valueOf(commitToPull.get()));
  } catch (IOException ioe) {
    throw new HoodieIOException(
        "Unable to read from source from checkpoint: " + lastCheckpointStr, ioe);
  }
}
}
代码示例来源:origin: com.uber.hoodie/hoodie-utilities
/**
 * Pulls the records added by the next unseen source commit, if any.
 *
 * @param lastCheckpointStr checkpoint produced by the previous pull, if any
 * @param sourceLimit       read budget (not consulted by this implementation)
 * @return pair of (records read, new checkpoint string); empty records when
 *         no commit newer than the checkpoint exists
 */
@Override
public Pair<Optional<JavaRDD<GenericRecord>>, String> fetchNewData(
    Optional<String> lastCheckpointStr, long sourceLimit) {
  try {
    // find the source commit to pull
    Optional<String> commitToPull = findCommitToPull(lastCheckpointStr);
    if (!commitToPull.isPresent()) {
      // No new commit: return no data and the unchanged checkpoint.
      // Optional.orElse replaces the verbose isPresent()/get() ternary.
      return new ImmutablePair<>(Optional.empty(),
          lastCheckpointStr.orElse(""));
    }
    // read the files out.
    List<FileStatus> commitDeltaFiles = Arrays.asList(
        fs.listStatus(new Path(incrPullRootPath, commitToPull.get())));
    String pathStr = commitDeltaFiles.stream().map(f -> f.getPath().toString())
        .collect(Collectors.joining(","));
    // Read the Avro files; the Hadoop RDD's keys carry the records.
    JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr,
        AvroKeyInputFormat.class, AvroKey.class, NullWritable.class,
        sparkContext.hadoopConfiguration());
    return new ImmutablePair<>(Optional.of(avroRDD.keys().map(r -> ((GenericRecord) r.datum()))),
        String.valueOf(commitToPull.get()));
  } catch (IOException ioe) {
    throw new HoodieIOException(
        "Unable to read from source from checkpoint: " + lastCheckpointStr, ioe);
  }
}
}
代码示例来源:origin: com.uber.hoodie/hoodie-utilities
/**
 * Loads the Avro files under {@code pathStr} and returns their records.
 * The Hadoop RDD's keys hold the data; values are {@code NullWritable}.
 *
 * @param convertor Avro converter (unused by this implementation)
 * @param pathStr   comma-separated list of file paths to read
 */
@Override
protected JavaRDD<GenericRecord> fromFiles(AvroConvertor convertor, String pathStr) {
  JavaPairRDD<AvroKey, NullWritable> avroKeyRdd = sparkContext.newAPIHadoopFile(pathStr,
      AvroKeyInputFormat.class, AvroKey.class, NullWritable.class,
      sparkContext.hadoopConfiguration());
  // Discard the null values and unwrap each AvroKey into its datum.
  return avroKeyRdd.keys().map(avroKey -> ((GenericRecord) avroKey.datum()));
}
}
代码示例来源:origin: uber/hudi
/**
 * Reads the given Avro files into an RDD of records. Each record travels
 * as the key of the underlying Hadoop pair RDD (values are NullWritable).
 *
 * @param convertor Avro converter (unused by this implementation)
 * @param pathStr   comma-separated list of file paths to read
 */
@Override
protected JavaRDD<GenericRecord> fromFiles(AvroConvertor convertor, String pathStr) {
  JavaPairRDD<AvroKey, NullWritable> rawAvro = sparkContext.newAPIHadoopFile(pathStr,
      AvroKeyInputFormat.class, AvroKey.class, NullWritable.class,
      sparkContext.hadoopConfiguration());
  // Only the keys matter; pull the GenericRecord out of each AvroKey wrapper.
  return rawAvro.keys().map(wrapped -> ((GenericRecord) wrapped.datum()));
}
}
代码示例来源:origin: org.apache.pig/pig
/**
 * Sorts tuples by their secondary key before re-keying them for the
 * global-rearrange operator.
 *
 * @param rdd         input tuples
 * @param op          operator carrying the secondary sort order
 * @param parallelism number of partitions for the hash repartition
 * @return tuples re-keyed as (IndexedKey, Tuple) pairs, secondary-sorted
 */
private JavaRDD<Tuple2<IndexedKey,Tuple>> handleSecondarySort(
    RDD<Tuple> rdd, POGlobalRearrangeSpark op, int parallelism) {
  // Pair every tuple with a dummy null value so it fits the pair-RDD API.
  RDD<Tuple2<Tuple, Object>> keyNullPairs = rdd.map(new ToKeyNullValueFunction(),
      SparkUtil.<Tuple, Object>getTuple2Manifest());
  JavaPairRDD<Tuple, Object> pairRDD = new JavaPairRDD<Tuple, Object>(keyNullPairs,
      SparkUtil.getManifest(Tuple.class),
      SparkUtil.getManifest(Object.class));
  // Hash-partition and sort each partition with the secondary-key comparator.
  JavaPairRDD<Tuple, Object> sortedPairs = pairRDD.repartitionAndSortWithinPartitions(
      new HashPartitioner(parallelism),
      new PigSecondaryKeyComparatorSpark(op.getSecondarySortOrder()));
  // Drop the dummy values and re-key the sorted tuples for the next stage.
  return sortedPairs.keys().map(new ToKeyValueFunction(op));
}
代码示例来源:origin: org.apache.pig/pig
/**
 * Applies a secondary-key sort to the tuples feeding the reduce-by operator.
 *
 * @param rdd         input tuples
 * @param op          reduce-by operator carrying the secondary sort order
 * @param parallelism partition count for the hash repartition
 * @return secondary-sorted tuples re-keyed as (IndexedKey, Tuple) pairs
 */
private JavaRDD<Tuple2<IndexedKey, Tuple>> handleSecondarySort(
    RDD<Tuple> rdd, POReduceBySpark op, int parallelism) {
  // Attach a placeholder null value to each tuple to use the pair-RDD API.
  RDD<Tuple2<Tuple, Object>> withNullValues = rdd.map(new ToKeyNullValueFunction(),
      SparkUtil.<Tuple, Object>getTuple2Manifest());
  JavaPairRDD<Tuple, Object> asPairs = new JavaPairRDD<Tuple, Object>(withNullValues,
      SparkUtil.getManifest(Tuple.class),
      SparkUtil.getManifest(Object.class));
  // Repartition by hash, sorting within each partition on the secondary key.
  JavaPairRDD<Tuple, Object> secondarySorted = asPairs.repartitionAndSortWithinPartitions(
      new HashPartitioner(parallelism),
      new PigSecondaryKeyComparatorSpark(op.getSecondarySortOrder()));
  // Strip the placeholder values, then re-key for the downstream operator.
  return secondarySorted.keys().map(new ToKeyValueFunction(op));
}
代码示例来源:origin: locationtech/geowave
leftIndex.setName("LeftIndex").keys().map(t -> t.getBytes()[0]).distinct(4).collectAsync();
final JavaFutureAction<List<Byte>> rightFuture =
rightIndex.setName("RightIndex").keys().map(t -> t.getBytes()[0]).distinct(
4).collectAsync();
内容来源于网络,如有侵权,请联系作者删除!