This article collects code examples of the org.apache.spark.api.java.JavaRDD.distinct() method in Java, demonstrating how JavaRDD.distinct() is used in practice. The examples are drawn mainly from selected projects on platforms such as GitHub, Stack Overflow, and Maven, and should serve as useful reference material. Details of the JavaRDD.distinct() method are as follows:
Package path: org.apache.spark.api.java.JavaRDD
Class name: JavaRDD
Method name: distinct
Method description: none available.
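Since no official description is given above, here is a minimal, self-contained sketch of the two overloads, distinct() and distinct(int numPartitions). The class name, app name, local master, and sample data are illustrative only, not taken from any of the projects below:

import java.util.Arrays;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class JavaRDDDistinctSketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("DistinctSketch").setMaster("local[*]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      JavaRDD<String> words = sc.parallelize(Arrays.asList("a", "b", "a", "c", "b"));

      // distinct(): deduplicates elements using their equals()/hashCode()
      List<String> unique = words.distinct().collect();   // [a, b, c], order not guaranteed
      System.out.println(unique);

      // distinct(int numPartitions): same result, but the underlying
      // shuffle produces the given number of partitions
      long count = words.distinct(2).count();             // 3
      System.out.println(count);
    }
  }
}

Note that distinct() is shuffle-based and lazy: nothing executes until an action such as collect() or count() runs, which is the pattern visible throughout the project examples below.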
Code example from: OryxProject/oryx
private static Map<String,Integer> buildIDIndexOneWayMap(PMML model,
                                                         JavaRDD<String[]> parsedTestRDD,
                                                         boolean user) {
  // Add to mapping everything from the model
  List<String> ids = AppPMMLUtils.getExtensionContent(model, user ? "XIDs" : "YIDs");
  Map<String,Integer> idIndex = new HashMap<>(ids.size());
  int index = 0;
  for (String id : ids) {
    idIndex.put(id, index++);
  }
  // And from the test set, which may have a few more IDs
  int offset = user ? 0 : 1;
  for (String id : parsedTestRDD.map(tokens -> tokens[offset]).distinct().collect()) {
    if (!idIndex.containsKey(id)) {
      idIndex.put(id, index++);
    }
  }
  return idIndex;
}
Code example from: OryxProject/oryx
private static Map<String,Integer> buildIDIndexMapping(JavaRDD<String[]> parsedRDD,
                                                       boolean user) {
  int offset = user ? 0 : 1;
  Map<String,Integer> reverseIDLookup = parsedRDD.map(tokens -> tokens[offset])
      .distinct().sortBy(s -> s, true, parsedRDD.getNumPartitions())
      .zipWithIndex().mapValues(Long::intValue)
      .collectAsMap();
  // Clone, due to some serialization problems with the result of collectAsMap?
  return new HashMap<>(reverseIDLookup);
}
Code example from: SeldonIO/seldon-server
// Collect the distinct set of client keys from the pair RDD
List<String> clientList = pairs.keys().distinct().collect();
Queue<ClientDetail> clientDetailQueue = new PriorityQueue<ClientDetail>(30, new Comparator<ClientDetail>() {
Code example from: OryxProject/oryx
// Broadcast the distinct set of item IDs to all executors
Broadcast<List<Integer>> allItemIDsBC = sparkContext.broadcast(positiveUserProducts.values().distinct().collect());
Code example from: mahmoudparsian/data-algorithms-book
JavaRDD<Tuple3<Long,Long,Long>> uniqueTriangles = trianglesWithDuplicates.distinct();
Code example from: locationtech/geowave
private JavaRDD<Tuple2<GeoWaveInputKey, Geometry>> prepareForReproject(
    final JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> indexedRDD,
    final int numPartitions) {
  return indexedRDD.values().distinct(numPartitions);
}
Code example from: com.davidbracewell/mango
@Override
public SparkStream<T> distinct() {
  return new SparkStream<>(rdd.distinct());
}
Code example from: rathboma/hadoop-framework-examples
public static JavaRDD<Tuple2<Integer,Optional<String>>> joinData(JavaPairRDD<Integer, Integer> t,
                                                                 JavaPairRDD<Integer, String> u) {
  JavaRDD<Tuple2<Integer,Optional<String>>> leftJoinOutput = t.leftOuterJoin(u).values().distinct();
  return leftJoinOutput;
}
Code example from: org.datavec/datavec-spark_2.11
/**
 * Get a list of unique values from the specified column.
 * For sequence data, use {@link #getUniqueSequence(String, Schema, JavaRDD)}
 *
 * @param columnName Name of the column to get unique values from
 * @param schema     Data schema
 * @param data       Data to get unique values from
 * @return List of unique values
 */
public static List<Writable> getUnique(String columnName, Schema schema, JavaRDD<List<Writable>> data) {
  int colIdx = schema.getIndexOfColumn(columnName);
  JavaRDD<Writable> ithColumn = data.map(new SelectColumnFunction(colIdx));
  return ithColumn.distinct().collect();
}
Code example from: org.qcri.rheem/rheem-spark
@Override
public Tuple<Collection<ExecutionLineageNode>, Collection<ChannelInstance>> evaluate(
    ChannelInstance[] inputs,
    ChannelInstance[] outputs,
    SparkExecutor sparkExecutor,
    OptimizationContext.OperatorContext operatorContext) {
  assert inputs.length == this.getNumInputs();
  assert outputs.length == this.getNumOutputs();

  final RddChannel.Instance input = (RddChannel.Instance) inputs[0];
  final RddChannel.Instance output = (RddChannel.Instance) outputs[0];

  final JavaRDD<Type> inputRdd = input.provideRdd();
  final JavaRDD<Type> outputRdd = inputRdd.distinct(sparkExecutor.getNumDefaultPartitions());
  this.name(outputRdd);
  output.accept(outputRdd, sparkExecutor);

  return ExecutionOperator.modelLazyExecution(inputs, outputs, operatorContext);
}
Code example from: unchartedsoftware/ensemble-clustering
clusters = clusters.distinct();
Code example from: uber/hudi
@Test
public void testTotalPutsBatching() throws Exception {
  HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
  HoodieWriteConfig config = getConfig();
  HBaseIndex index = new HBaseIndex(config);
  HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config);

  // start a commit and generate test data
  String newCommitTime = writeClient.startCommit();
  List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 250);
  JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
  HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
  HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, config, jsc);

  // Insert the 250 records
  JavaRDD<WriteStatus> writeStatues = writeClient.upsert(writeRecords, newCommitTime);
  // commit this upsert
  writeClient.commit(newCommitTime, writeStatues);

  // Mock hbaseConnection and related entities
  Connection hbaseConnection = Mockito.mock(Connection.class);
  HTable table = Mockito.mock(HTable.class);
  Mockito.when(hbaseConnection.getTable(TableName.valueOf(tableName))).thenReturn(table);
  Mockito.when(table.get((List<Get>) anyObject())).thenReturn(new Result[0]);

  // only for test, set the hbaseConnection to mocked object
  index.setHbaseConnection(hbaseConnection);

  // Count the distinct file IDs touched by the writes
  int numberOfDataFileIds = (int) writeStatues.map(status -> status.getFileId()).distinct().count();

  index.updateLocation(writeStatues, jsc, hoodieTable);
  // 3 batches should be executed given batchSize = 100 and <= numberOfDataFileIds getting updated,
  // so each fileId ideally gets updated
  Mockito.verify(table, atMost(numberOfDataFileIds)).put((List<Put>) anyObject());
}
Code example from: locationtech/geowave
final JavaFutureAction<List<Byte>> leftFuture =
    leftIndex.setName("LeftIndex").keys().map(t -> t.getBytes()[0]).distinct(4).collectAsync();
final JavaFutureAction<List<Byte>> rightFuture =
    rightIndex.setName("RightIndex").keys().map(t -> t.getBytes()[0]).distinct(4).collectAsync();
Code example from: org.datasyslab/geospark
: resultWithDuplicates.distinct();
Code example from: uber/hudi
javaRDD = index.tagLocation(writeRecords, jsc, hoodieTable);
assertTrue(javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 200);
assertTrue(javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count() == 200);
assertTrue(javaRDD.filter(
    record -> (record.getCurrentLocation() != null
        && record.getCurrentLocation().getCommitTime().equals(newCommitTime))).distinct().count() == 200);
Code example from: uber/hudi
List<String> taggedFileIds = javaRDD.map(record -> record.getCurrentLocation().getFileId()).distinct().collect();