This article collects code examples of the Java method org.apache.spark.api.java.JavaRDD.filter() and shows how JavaRDD.filter() is used in practice. The examples are extracted from selected open-source projects hosted on platforms such as GitHub, Stack Overflow, and Maven, so they should serve as useful references. Details of the JavaRDD.filter() method:
Package: org.apache.spark.api.java
Class: JavaRDD
Method: filter
Description: none available
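Before the project excerpts below, here is a minimal, self-contained sketch of the method. It is illustrative only: the class name FilterExample and the local master URL are assumptions, not taken from any of the cited projects.

import java.util.Arrays;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class FilterExample {
  public static void main(String[] args) {
    // Illustrative local context; any master URL works the same way.
    JavaSparkContext sc = new JavaSparkContext("local", "filterExample");
    JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6));
    // filter() keeps exactly the elements for which the predicate returns true.
    JavaRDD<Integer> evens = numbers.filter(x -> x % 2 == 0);
    System.out.println(evens.collect()); // prints [2, 4, 6]
    sc.stop();
  }
}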
Code example source: databricks/learning-spark

public static void main(String[] args) throws Exception {
  String master;
  if (args.length > 0) {
    master = args[0];
  } else {
    master = "local";
  }
  JavaSparkContext sc = new JavaSparkContext(
      master, "basicmapfilter", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
  JavaRDD<Integer> squared = rdd.map(
      new Function<Integer, Integer>() { public Integer call(Integer x) { return x * x; } });
  JavaRDD<Integer> result = squared.filter(
      new Function<Integer, Boolean>() { public Boolean call(Integer x) { return x != 1; } });
  System.out.println(StringUtils.join(result.collect(), ","));
}
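On Java 8 and later the same map/filter chain can be written with lambdas; the following two statements are an equivalent sketch of the anonymous classes above, not part of the original example:

JavaRDD<Integer> squared = rdd.map(x -> x * x);
JavaRDD<Integer> result = squared.filter(x -> x != 1);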
Code example source: databricks/learning-spark

public static void main(String[] args) throws Exception {
  if (args.length != 4) { // four arguments are required, matching the usage string below
    throw new Exception("Usage BasicLoadCsv sparkMaster csvInputFile csvOutputFile key");
  }
  String master = args[0];
  String csvInput = args[1];
  String outputFile = args[2];
  final String key = args[3];
  JavaSparkContext sc = new JavaSparkContext(
      master, "loadwholecsv", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  JavaPairRDD<String, String> csvData = sc.wholeTextFiles(csvInput);
  JavaRDD<String[]> keyedRDD = csvData.flatMap(new ParseLine());
  JavaRDD<String[]> result =
      keyedRDD.filter(new Function<String[], Boolean>() {
        public Boolean call(String[] input) { return input[0].equals(key); } });
  result.saveAsTextFile(outputFile);
}
Code example source: databricks/learning-spark

public static void main(String[] args) throws Exception {
  if (args.length != 3) {
    throw new Exception("Usage BasicLoadJson [sparkMaster] [jsoninput] [jsonoutput]");
  }
  String master = args[0];
  String fileName = args[1];
  String outfile = args[2];
  JavaSparkContext sc = new JavaSparkContext(
      master, "basicloadjson", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  JavaRDD<String> input = sc.textFile(fileName);
  JavaRDD<Person> result = input.mapPartitions(new ParseJson()).filter(new LikesPandas());
  JavaRDD<String> formatted = result.mapPartitions(new WriteJson());
  formatted.saveAsTextFile(outfile);
}
Code example source: OryxProject/oryx

/**
 * Implementation which splits based solely on time. It returns approximately the
 * earliest 1 - {@link #getTestFraction()} of the input, ordered by timestamp, as new
 * training data, and the most recent {@link #getTestFraction()} as test data.
 */
@Override
protected Pair<JavaRDD<String>,JavaRDD<String>> splitNewDataToTrainTest(JavaRDD<String> newData) {
  // Rough approximation; assumes timestamps are fairly evenly distributed
  StatCounter maxMin = newData.mapToDouble(line -> MLFunctions.TO_TIMESTAMP_FN.call(line).doubleValue()).stats();
  long minTime = (long) maxMin.min();
  long maxTime = (long) maxMin.max();
  log.info("New data timestamp range: {} - {}", minTime, maxTime);
  long approxTestTrainBoundary = (long) (maxTime - getTestFraction() * (maxTime - minTime));
  log.info("Splitting at timestamp {}", approxTestTrainBoundary);
  JavaRDD<String> newTrainData = newData.filter(
      line -> MLFunctions.TO_TIMESTAMP_FN.call(line) < approxTestTrainBoundary);
  JavaRDD<String> testData = newData.filter(
      line -> MLFunctions.TO_TIMESTAMP_FN.call(line) >= approxTestTrainBoundary);
  return new Pair<>(newTrainData, testData);
}
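For example, with minTime = 0, maxTime = 100,000, and a test fraction of 0.1, the boundary falls at 90,000, so roughly the earliest 90% of lines become training data and the most recent 10% become test data.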
Code example source: OryxProject/oryx

static double accuracy(DecisionForest forest, JavaRDD<Example> examples) {
  long total = examples.count();
  if (total == 0) {
    return 0.0;
  }
  long correct = examples.filter(example -> {
    CategoricalPrediction prediction = (CategoricalPrediction) forest.predict(example);
    CategoricalFeature target = (CategoricalFeature) example.getTarget();
    return prediction.getMostProbableCategoryEncoding() == target.getEncoding();
  }).count();
  return (double) correct / total;
}
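Because filter() is a transformation and count() an action, the correct/total ratio is computed on the executors without ever collecting the examples to the driver.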
Code example source: SeldonIO/seldon-server
dataSet = dataSet.filter(clientFilter);
JavaRDD<String> json_only = json_only_with_zeros.filter(new Function<String, Boolean>() {
Code example source: deeplearning4j/dl4j-examples
JavaRDD<List<Writable>> parsedInputData = stringData.filter((x) -> !x.isEmpty()).map(new StringToWritablesFunction(rr));
Code example source: org.apache.spark/spark-core (the spark-core_2.10 and spark-core_2.11 artifacts ship an identical test)

@Test
public void isEmpty() {
  assertTrue(sc.emptyRDD().isEmpty());
  assertTrue(sc.parallelize(new ArrayList<Integer>()).isEmpty());
  assertFalse(sc.parallelize(Arrays.asList(1)).isEmpty());
  assertTrue(sc.parallelize(Arrays.asList(1, 2, 3), 3).filter(i -> i < 0).isEmpty());
  assertFalse(sc.parallelize(Arrays.asList(1, 2, 3)).filter(i -> i > 1).isEmpty());
}
Code example source: databricks/learning-spark

JavaRDD<String> validCallSigns = callSigns.filter(
    new Function<String, Boolean>() { public Boolean call(String callSign) {
      Pattern p = Pattern.compile("\\A\\d?\\p{Alpha}{1,2}\\d{1,4}\\p{Alpha}{1,3}\\Z");
      // The source snippet is truncated here; the natural completion keeps the
      // call signs that match the pattern.
      return p.matcher(callSign).matches();
    }});
Code example source: deeplearning4j/dl4j-examples

logLines = logLines.filter(new Function<String, Boolean>() {
  @Override
  public Boolean call(String s) throws Exception {
Code example source: apache/tinkerpop

@Override
public boolean cp(final String sourceLocation, final String targetLocation) {
  final List<String> rdds = Spark.getRDDs().stream()
      .filter(r -> r.name().startsWith(sourceLocation))
      .map(RDD::name)
      .collect(Collectors.toList());
  if (rdds.isEmpty())
    return false;
  for (final String rdd : rdds) {
    // filter(a -> true) is an identity pass; it only derives a new RDD that can be
    // renamed and cached as a copy of the source, and count() then materializes it.
    Spark.getRDD(rdd).toJavaRDD().filter(a -> true)
        .setName(rdd.equals(sourceLocation) ? targetLocation : rdd.replace(sourceLocation, targetLocation))
        .cache().count();
    // TODO: this should use the original storage level
  }
  return true;
}
Code example source: mahmoudparsian/data-algorithms-book

JavaRDD<String> filteredRDD = records.filter((String record) -> {
  String firstChar = record.substring(0, 1);
  if (firstChar.equals("@") ||
Code example source: mahmoudparsian/data-algorithms-book

JavaRDD<String> filteredRDD = records.filter(new Function<String, Boolean>() {
  @Override
  public Boolean call(String record) {
Code example source: mahmoudparsian/data-algorithms-book

JavaRDD<String> filtered = biosets.filter((String record) -> {
  String ref = REF.value();
  String[] tokens = record.split(",");
Code example source: mahmoudparsian/data-algorithms-book

JavaRDD<String> filtered = biosets.filter(new Function<String, Boolean>() {
  @Override
  public Boolean call(String record) {
Code example source: uber/hudi

@VisibleForTesting
public int getHBasePutAccessParallelism(final JavaRDD<WriteStatus> writeStatusRDD) {
  // Count only the write statuses that report inserts; Math.max(..., 1) keeps the
  // resulting parallelism at least 1 even when nothing was inserted.
  return Math.toIntExact(Math.max(writeStatusRDD
      .filter(w -> w.getStat().getNumInserts() > 0).count(), 1));
}
Code example source: uber/hudi

/**
 * Filter out HoodieRecords that already exist in the output folder. This is useful
 * for deduplication.
 *
 * @param hoodieRecords Input RDD of Hoodie records.
 * @return A subset of hoodieRecords RDD, with existing records filtered out.
 */
public JavaRDD<HoodieRecord<T>> filterExists(JavaRDD<HoodieRecord<T>> hoodieRecords) {
  JavaRDD<HoodieRecord<T>> recordsWithLocation = tagLocation(hoodieRecords);
  return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown());
}
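A hedged usage sketch of the method above; writeClient, incomingRecords, and commitTime are assumed context and not part of the Hudi snippet:

// Hypothetical setup: writeClient exposes filterExists() as shown above, and
// incomingRecords is a JavaRDD<HoodieRecord<T>> built elsewhere.
JavaRDD<HoodieRecord<T>> freshRecords = writeClient.filterExists(incomingRecords);
// Only records whose location is unknown (i.e., not yet in the table) are written.
writeClient.insert(freshRecords, commitTime);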
Code example source: apache/tinkerpop

.filter(vertexWritable -> ElementHelper.idExists(vertexWritable.get().id(), graphStepIds)) // ensure vertex ids are in V(x)
.flatMap(vertexWritable -> {
  if (identityTraversal) // g.V.count()-style (identity)