Usage of org.apache.spark.api.java.JavaRDD.distinct() with code examples


This article collects a number of Java code examples for the org.apache.spark.api.java.JavaRDD.distinct() method and shows how JavaRDD.distinct() is used in practice. The examples are taken from selected projects on platforms such as GitHub, Stack Overflow, and Maven, so they should serve as useful references. Details of the JavaRDD.distinct() method are as follows:
Package: org.apache.spark.api.java
Class: JavaRDD
Method: distinct

About JavaRDD.distinct

distinct() returns a new JavaRDD containing only the distinct elements of the source RDD; the overload distinct(int numPartitions) additionally sets the number of partitions of the result. Deduplication requires a shuffle, so it can be expensive on large data sets.
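
A minimal sketch of the basic call pattern follows; the class name, app name, local master setting, and sample data are illustrative assumptions rather than code from the projects cited below.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

// Illustrative example only; class and app names are placeholders.
public class DistinctExample {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("DistinctExample").setMaster("local[*]");
    try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
      JavaRDD<String> words = jsc.parallelize(Arrays.asList("a", "b", "a", "c", "b"));

      // distinct() removes duplicates; element order in the result is not guaranteed.
      List<String> unique = words.distinct().collect();

      // The overload controls how many partitions the deduplicated RDD has.
      long uniqueCount = words.distinct(2).count();

      System.out.println(unique + " -> " + uniqueCount);   // e.g. [a, b, c] -> 3
    }
  }
}

Most of the examples that follow use the same pattern: deduplicate with distinct() (optionally passing a partition count) and then collect(), count(), or keep chaining transformations.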

Code Examples

Code example origin: OryxProject/oryx

private static Map<String,Integer> buildIDIndexOneWayMap(PMML model,
                                                          JavaRDD<String[]> parsedTestRDD,
                                                          boolean user) {
  // Add to mapping everything from the model
  List<String> ids = AppPMMLUtils.getExtensionContent(model, user ? "XIDs" : "YIDs");
  Map<String,Integer> idIndex = new HashMap<>(ids.size());
  int index = 0;
  for (String id : ids) {
    idIndex.put(id, index++);
  }
  // And from the test set, which may have a few more IDs
  int offset = user ? 0 : 1;
  for (String id : parsedTestRDD.map(tokens -> tokens[offset]).distinct().collect()) {
    if (!idIndex.containsKey(id)) {
      idIndex.put(id, index++);
    }
  }
  return idIndex;
}

Code example origin: OryxProject/oryx

private static Map<String,Integer> buildIDIndexMapping(JavaRDD<String[]> parsedRDD,
                                                        boolean user) {
  int offset = user ? 0 : 1;
  Map<String,Integer> reverseIDLookup = parsedRDD.map(tokens -> tokens[offset])
      .distinct().sortBy(s -> s, true, parsedRDD.getNumPartitions())
      .zipWithIndex().mapValues(Long::intValue)
      .collectAsMap();
  // Clone, due to some serialization problems with the result of collectAsMap?
  return new HashMap<>(reverseIDLookup);
}

Code example origin: SeldonIO/seldon-server

List<String> clientList = pairs.keys().distinct().collect();
Queue<ClientDetail> clientDetailQueue = new PriorityQueue<ClientDetail>(30, new Comparator<ClientDetail>() {

Code example origin: OryxProject/oryx

Broadcast<List<Integer>> allItemIDsBC = sparkContext.broadcast(positiveUserProducts.values().distinct().collect());

Code example origin: mahmoudparsian/data-algorithms-book

JavaRDD<Tuple3<Long,Long,Long>> uniqueTriangles = trianglesWithDuplicates.distinct();

Code example origin: locationtech/geowave

private JavaRDD<Tuple2<GeoWaveInputKey, Geometry>> prepareForReproject(
  final JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> indexedRDD,
  final int numPartitions) {
 return indexedRDD.values().distinct(numPartitions);
}

Code example origin: com.davidbracewell/mango

@Override
public SparkStream<T> distinct() {
 return new SparkStream<>(rdd.distinct());
}

Code example origin: rathboma/hadoop-framework-examples

public static JavaRDD<Tuple2<Integer,Optional<String>>> joinData(JavaPairRDD<Integer, Integer> t, JavaPairRDD<Integer, String> u){
  JavaRDD<Tuple2<Integer,Optional<String>>> leftJoinOutput = t.leftOuterJoin(u).values().distinct();
  return leftJoinOutput;
}

Code example origin: org.datavec/datavec-spark_2.11

/**
 * Get a list of unique values from the specified column.
 * For sequence data, use {@link #getUniqueSequence(String, Schema, JavaRDD)}
 *
 * @param columnName    Name of the column to get unique values from
 * @param schema        Data schema
 * @param data          Data to get unique values from
 * @return              List of unique values
 */
public static List<Writable> getUnique(String columnName, Schema schema, JavaRDD<List<Writable>> data) {
  int colIdx = schema.getIndexOfColumn(columnName);
  JavaRDD<Writable> ithColumn = data.map(new SelectColumnFunction(colIdx));
  return ithColumn.distinct().collect();
}

Code example origin: org.datavec/datavec-spark

/**
 * Get a list of unique values from the specified column.
 * For sequence data, use {@link #getUniqueSequence(String, Schema, JavaRDD)}
 *
 * @param columnName    Name of the column to get unique values from
 * @param schema        Data schema
 * @param data          Data to get unique values from
 * @return              List of unique values
 */
public static List<Writable> getUnique(String columnName, Schema schema, JavaRDD<List<Writable>> data) {
  int colIdx = schema.getIndexOfColumn(columnName);
  JavaRDD<Writable> ithColumn = data.map(new SelectColumnFunction(colIdx));
  return ithColumn.distinct().collect();
}

Code example origin: org.qcri.rheem/rheem-spark

@Override
public Tuple<Collection<ExecutionLineageNode>, Collection<ChannelInstance>> evaluate(
    ChannelInstance[] inputs,
    ChannelInstance[] outputs,
    SparkExecutor sparkExecutor,
    OptimizationContext.OperatorContext operatorContext) {
  assert inputs.length == this.getNumInputs();
  assert outputs.length == this.getNumOutputs();
  final RddChannel.Instance input = (RddChannel.Instance) inputs[0];
  final RddChannel.Instance output = (RddChannel.Instance) outputs[0];
  final JavaRDD<Type> inputRdd = input.provideRdd();
  final JavaRDD<Type> outputRdd = inputRdd.distinct(sparkExecutor.getNumDefaultPartitions());
  this.name(outputRdd);
  output.accept(outputRdd, sparkExecutor);
  return ExecutionOperator.modelLazyExecution(inputs, outputs, operatorContext);
}

Code example origin: unchartedsoftware/ensemble-clustering

clusters = clusters.distinct();

Code example origin: uber/hudi

@Test
public void testTotalPutsBatching() throws Exception {
 HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
 HoodieWriteConfig config = getConfig();
 HBaseIndex index = new HBaseIndex(config);
 HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config);
 // start a commit and generate test data
 String newCommitTime = writeClient.startCommit();
 List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 250);
 JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
 HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
 HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, config, jsc);
 // Insert 200 records
 JavaRDD<WriteStatus> writeStatues = writeClient.upsert(writeRecords, newCommitTime);
 // commit this upsert
 writeClient.commit(newCommitTime, writeStatues);
 // Mock hbaseConnection and related entities
 Connection hbaseConnection = Mockito.mock(Connection.class);
 HTable table = Mockito.mock(HTable.class);
 Mockito.when(hbaseConnection.getTable(TableName.valueOf(tableName))).thenReturn(table);
 Mockito.when(table.get((List<Get>) anyObject())).thenReturn(new Result[0]);
 // only for test, set the hbaseConnection to mocked object
 index.setHbaseConnection(hbaseConnection);
 // Get all the files generated
 int numberOfDataFileIds = (int) writeStatues.map(status -> status.getFileId()).distinct().count();
 index.updateLocation(writeStatues, jsc, hoodieTable);
 // 3 batches should be executed given batchSize = 100 and <=numberOfDataFileIds getting updated,
 // so each fileId ideally gets updates
 Mockito.verify(table, atMost(numberOfDataFileIds)).put((List<Put>) anyObject());
}

Code example origin: locationtech/geowave

leftIndex.setName("LeftIndex").keys().map(t -> t.getBytes()[0]).distinct(4).collectAsync();
final JavaFutureAction<List<Byte>> rightFuture =
  rightIndex.setName("RightIndex").keys().map(t -> t.getBytes()[0]).distinct(
    4).collectAsync();

Code example origin: org.datasyslab/geospark

: resultWithDuplicates.distinct();

Code example origin: DataSystemsLab/GeoSpark

: resultWithDuplicates.distinct();

Code example origin: uber/hudi

javaRDD = index.tagLocation(writeRecords, jsc, hoodieTable);
assertTrue(javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 200);
assertTrue(javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count() == 200);
assertTrue(javaRDD.filter(
  record -> (record.getCurrentLocation() != null && record.getCurrentLocation().getCommitTime()
    .equals(newCommitTime))).distinct().count() == 200);

Code example origin: uber/hudi

List<String> taggedFileIds = javaRDD.map(record -> record.getCurrentLocation().getFileId()).distinct().collect();
