本文整理了Java中org.apache.spark.api.java.JavaPairRDD.filter()
方法的一些代码示例,展示了JavaPairRDD.filter()
的具体用法。这些代码示例主要来源于Github
/Stackoverflow
/Maven
等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。JavaPairRDD.filter()
方法的具体详情如下:
包路径:org.apache.spark.api.java.JavaPairRDD
类名称:JavaPairRDD
方法名:filter
暂无
代码示例来源:origin: databricks/learning-spark
/**
 * Returns only the IP addresses whose request count exceeds 10.
 *
 * @param ipAddressCount pairs of (IP address, request count)
 * @return an RDD of the IP addresses passing the {@code IpCountGreaterThan10} predicate
 */
// NOTE: "final" dropped — it is redundant on a static method, which can never be overridden.
public static JavaRDD<String> filterIPAddress(
    JavaPairRDD<String, Long> ipAddressCount) {
  // Keep qualifying pairs, then discard the counts and retain just the keys.
  return ipAddressCount
      .filter(new IpCountGreaterThan10())
      .keys();
}
代码示例来源:origin: databricks/learning-spark
/**
 * Reads a text file, keys each line by its first whitespace-delimited word,
 * keeps only entries whose full line is shorter than 20 characters, and prints
 * each resulting key:value pair to stdout.
 *
 * <p>Usage: KeyValueMapFilter sparkMaster inputFile
 *
 * @param args {@code args[0]} = Spark master URL, {@code args[1]} = input file path
 * @throws Exception if Spark fails while reading or collecting
 */
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    // Fail fast with a standard unchecked type instead of a raw Exception.
    throw new IllegalArgumentException("Usage KeyValueMapFilter sparkMaster inputFile");
  }
  String master = args[0];
  String inputFile = args[1];
  JavaSparkContext sc = new JavaSparkContext(
      master, "KeyValueMapFilter", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  try {
    JavaRDD<String> input = sc.textFile(inputFile);
    // Key each line by its first whitespace-delimited token.
    PairFunction<String, String, String> keyData = new PairFunction<String, String, String>() {
      @Override
      public Tuple2<String, String> call(String x) {
        // Diamond operator instead of the original raw-typed "new Tuple2(...)".
        return new Tuple2<>(x.split(" ")[0], x);
      }
    };
    // Keep only pairs whose full line is shorter than 20 characters.
    Function<Tuple2<String, String>, Boolean> longWordFilter =
        new Function<Tuple2<String, String>, Boolean>() {
      @Override
      public Boolean call(Tuple2<String, String> input) {
        return input._2().length() < 20;
      }
    };
    JavaPairRDD<String, String> rdd = input.mapToPair(keyData);
    JavaPairRDD<String, String> result = rdd.filter(longWordFilter);
    Map<String, String> resultMap = result.collectAsMap();
    for (Entry<String, String> entry : resultMap.entrySet()) {
      System.out.println(entry.getKey() + ":" + entry.getValue());
    }
  } finally {
    // Always release the SparkContext and its cluster resources,
    // even if the job above throws (the original leaked it).
    sc.stop();
  }
}
}
代码示例来源:origin: OryxProject/oryx
timestampRatingRDD = timestampRatingRDD.filter(timestampRating -> timestampRating._2().rating() > theThreshold);
代码示例来源:origin: SeldonIO/seldon-server
final String currentClient = client;
JavaPairRDD<String, ActionData> filtered_by_client = pairs.filter(new Function<Tuple2<String, ActionData>, Boolean>() {
JavaPairRDD<String, ActionData> nonZeroUserIds = filtered_by_client.filter(new Function<Tuple2<String, ActionData>, Boolean>() {
代码示例来源:origin: org.apache.spark/spark-core_2.11
/**
 * Verifies leftOuterJoin: every left-side pair appears in the output, with the
 * right-side value wrapped in an Optional that is absent for unmatched keys.
 */
@Test
public void leftOuterJoin() {
  // Left side: keys 1,1,2,3; key 3 has no match on the right.
  JavaPairRDD<Integer, Integer> rdd1 = sc.parallelizePairs(Arrays.asList(
    new Tuple2<>(1, 1),
    new Tuple2<>(1, 2),
    new Tuple2<>(2, 1),
    new Tuple2<>(3, 1)
  ));
  // Right side: keys 1,2,2,4; key 4 never appears in a left outer join's output.
  JavaPairRDD<Integer, Character> rdd2 = sc.parallelizePairs(Arrays.asList(
    new Tuple2<>(1, 'x'),
    new Tuple2<>(2, 'y'),
    new Tuple2<>(2, 'z'),
    new Tuple2<>(4, 'w')
  ));
  // Compute the join once and reuse it; the original re-ran the whole join
  // a second time just to find the unmatched pair.
  JavaPairRDD<Integer, Tuple2<Integer, Optional<Character>>> joinedRdd =
    rdd1.leftOuterJoin(rdd2);
  List<Tuple2<Integer, Tuple2<Integer, Optional<Character>>>> joined = joinedRdd.collect();
  // (1,1) and (1,2) each match 'x'; (2,1) matches both 'y' and 'z'; (3,1) is unmatched.
  Assert.assertEquals(5, joined.size());
  // The only pair with an absent right-hand value must be the one keyed by 3.
  Tuple2<Integer, Tuple2<Integer, Optional<Character>>> firstUnmatched =
    joinedRdd.filter(tup -> !tup._2()._2().isPresent()).first();
  Assert.assertEquals(3, firstUnmatched._1().intValue());
}
代码示例来源:origin: org.apache.spark/spark-core_2.10
/**
 * Verifies leftOuterJoin: every left-side pair appears in the output, with the
 * right-side value wrapped in an Optional that is absent for unmatched keys.
 */
@Test
public void leftOuterJoin() {
  // Left side: keys 1,1,2,3; key 3 has no match on the right.
  JavaPairRDD<Integer, Integer> rdd1 = sc.parallelizePairs(Arrays.asList(
    new Tuple2<>(1, 1),
    new Tuple2<>(1, 2),
    new Tuple2<>(2, 1),
    new Tuple2<>(3, 1)
  ));
  // Right side: keys 1,2,2,4; key 4 never appears in a left outer join's output.
  JavaPairRDD<Integer, Character> rdd2 = sc.parallelizePairs(Arrays.asList(
    new Tuple2<>(1, 'x'),
    new Tuple2<>(2, 'y'),
    new Tuple2<>(2, 'z'),
    new Tuple2<>(4, 'w')
  ));
  // Compute the join once and reuse it; the original re-ran the whole join
  // a second time just to find the unmatched pair.
  JavaPairRDD<Integer, Tuple2<Integer, Optional<Character>>> joinedRdd =
    rdd1.leftOuterJoin(rdd2);
  List<Tuple2<Integer, Tuple2<Integer, Optional<Character>>>> joined = joinedRdd.collect();
  // (1,1) and (1,2) each match 'x'; (2,1) matches both 'y' and 'z'; (3,1) is unmatched.
  Assert.assertEquals(5, joined.size());
  // The only pair with an absent right-hand value must be the one keyed by 3.
  Tuple2<Integer, Tuple2<Integer, Optional<Character>>> firstUnmatched =
    joinedRdd.filter(tup -> !tup._2()._2().isPresent()).first();
  Assert.assertEquals(3, firstUnmatched._1().intValue());
}
代码示例来源:origin: org.apache.spark/spark-core_2.10
/**
 * Verifies leftOuterJoin: every left-side pair appears in the output, with the
 * right-side value wrapped in an Optional that is absent for unmatched keys.
 */
// Dropped @SuppressWarnings("unchecked"): no unchecked operation occurs in this body.
@Test
public void leftOuterJoin() {
  // Left side: keys 1,1,2,3; key 3 has no match on the right.
  JavaPairRDD<Integer, Integer> rdd1 = sc.parallelizePairs(Arrays.asList(
    new Tuple2<>(1, 1),
    new Tuple2<>(1, 2),
    new Tuple2<>(2, 1),
    new Tuple2<>(3, 1)
  ));
  // Right side: keys 1,2,2,4; key 4 never appears in a left outer join's output.
  JavaPairRDD<Integer, Character> rdd2 = sc.parallelizePairs(Arrays.asList(
    new Tuple2<>(1, 'x'),
    new Tuple2<>(2, 'y'),
    new Tuple2<>(2, 'z'),
    new Tuple2<>(4, 'w')
  ));
  // Compute the join once and reuse it; the original re-ran the whole join
  // a second time just to find the unmatched pair.
  JavaPairRDD<Integer, Tuple2<Integer, Optional<Character>>> joinedRdd =
    rdd1.leftOuterJoin(rdd2);
  List<Tuple2<Integer, Tuple2<Integer, Optional<Character>>>> joined = joinedRdd.collect();
  // (1,1) and (1,2) each match 'x'; (2,1) matches both 'y' and 'z'; (3,1) is unmatched.
  assertEquals(5, joined.size());
  // The only pair with an absent right-hand value must be the one keyed by 3.
  Tuple2<Integer, Tuple2<Integer, Optional<Character>>> firstUnmatched =
    joinedRdd.filter(tup -> !tup._2()._2().isPresent()).first();
  assertEquals(3, firstUnmatched._1().intValue());
}
代码示例来源:origin: org.apache.spark/spark-core
/**
 * Verifies leftOuterJoin: every left-side pair appears in the output, with the
 * right-side value wrapped in an Optional that is absent for unmatched keys.
 */
@Test
public void leftOuterJoin() {
  // Left side: keys 1,1,2,3; key 3 has no match on the right.
  JavaPairRDD<Integer, Integer> rdd1 = sc.parallelizePairs(Arrays.asList(
    new Tuple2<>(1, 1),
    new Tuple2<>(1, 2),
    new Tuple2<>(2, 1),
    new Tuple2<>(3, 1)
  ));
  // Right side: keys 1,2,2,4; key 4 never appears in a left outer join's output.
  JavaPairRDD<Integer, Character> rdd2 = sc.parallelizePairs(Arrays.asList(
    new Tuple2<>(1, 'x'),
    new Tuple2<>(2, 'y'),
    new Tuple2<>(2, 'z'),
    new Tuple2<>(4, 'w')
  ));
  // Compute the join once and reuse it; the original re-ran the whole join
  // a second time just to find the unmatched pair.
  JavaPairRDD<Integer, Tuple2<Integer, Optional<Character>>> joinedRdd =
    rdd1.leftOuterJoin(rdd2);
  List<Tuple2<Integer, Tuple2<Integer, Optional<Character>>>> joined = joinedRdd.collect();
  // (1,1) and (1,2) each match 'x'; (2,1) matches both 'y' and 'z'; (3,1) is unmatched.
  Assert.assertEquals(5, joined.size());
  // The only pair with an absent right-hand value must be the one keyed by 3.
  Tuple2<Integer, Tuple2<Integer, Optional<Character>>> firstUnmatched =
    joinedRdd.filter(tup -> !tup._2()._2().isPresent()).first();
  Assert.assertEquals(3, firstUnmatched._1().intValue());
}
代码示例来源:origin: org.apache.spark/spark-core_2.11
/**
 * Verifies leftOuterJoin: every left-side pair appears in the output, with the
 * right-side value wrapped in an Optional that is absent for unmatched keys.
 */
// Dropped @SuppressWarnings("unchecked"): no unchecked operation occurs in this body.
@Test
public void leftOuterJoin() {
  // Left side: keys 1,1,2,3; key 3 has no match on the right.
  JavaPairRDD<Integer, Integer> rdd1 = sc.parallelizePairs(Arrays.asList(
    new Tuple2<>(1, 1),
    new Tuple2<>(1, 2),
    new Tuple2<>(2, 1),
    new Tuple2<>(3, 1)
  ));
  // Right side: keys 1,2,2,4; key 4 never appears in a left outer join's output.
  JavaPairRDD<Integer, Character> rdd2 = sc.parallelizePairs(Arrays.asList(
    new Tuple2<>(1, 'x'),
    new Tuple2<>(2, 'y'),
    new Tuple2<>(2, 'z'),
    new Tuple2<>(4, 'w')
  ));
  // Compute the join once and reuse it; the original re-ran the whole join
  // a second time just to find the unmatched pair.
  JavaPairRDD<Integer, Tuple2<Integer, Optional<Character>>> joinedRdd =
    rdd1.leftOuterJoin(rdd2);
  List<Tuple2<Integer, Tuple2<Integer, Optional<Character>>>> joined = joinedRdd.collect();
  // (1,1) and (1,2) each match 'x'; (2,1) matches both 'y' and 'z'; (3,1) is unmatched.
  assertEquals(5, joined.size());
  // The only pair with an absent right-hand value must be the one keyed by 3.
  Tuple2<Integer, Tuple2<Integer, Optional<Character>>> firstUnmatched =
    joinedRdd.filter(tup -> !tup._2()._2().isPresent()).first();
  assertEquals(3, firstUnmatched._1().intValue());
}
代码示例来源:origin: org.apache.spark/spark-core
/**
 * Verifies leftOuterJoin: every left-side pair appears in the output, with the
 * right-side value wrapped in an Optional that is absent for unmatched keys.
 */
// Dropped @SuppressWarnings("unchecked"): no unchecked operation occurs in this body.
@Test
public void leftOuterJoin() {
  // Left side: keys 1,1,2,3; key 3 has no match on the right.
  JavaPairRDD<Integer, Integer> rdd1 = sc.parallelizePairs(Arrays.asList(
    new Tuple2<>(1, 1),
    new Tuple2<>(1, 2),
    new Tuple2<>(2, 1),
    new Tuple2<>(3, 1)
  ));
  // Right side: keys 1,2,2,4; key 4 never appears in a left outer join's output.
  JavaPairRDD<Integer, Character> rdd2 = sc.parallelizePairs(Arrays.asList(
    new Tuple2<>(1, 'x'),
    new Tuple2<>(2, 'y'),
    new Tuple2<>(2, 'z'),
    new Tuple2<>(4, 'w')
  ));
  // Compute the join once and reuse it; the original re-ran the whole join
  // a second time just to find the unmatched pair.
  JavaPairRDD<Integer, Tuple2<Integer, Optional<Character>>> joinedRdd =
    rdd1.leftOuterJoin(rdd2);
  List<Tuple2<Integer, Tuple2<Integer, Optional<Character>>>> joined = joinedRdd.collect();
  // (1,1) and (1,2) each match 'x'; (2,1) matches both 'y' and 'z'; (3,1) is unmatched.
  assertEquals(5, joined.size());
  // The only pair with an absent right-hand value must be the one keyed by 3.
  Tuple2<Integer, Tuple2<Integer, Optional<Character>>> firstUnmatched =
    joinedRdd.filter(tup -> !tup._2()._2().isPresent()).first();
  assertEquals(3, firstUnmatched._1().intValue());
}
代码示例来源:origin: OryxProject/oryx
aggregated.filter(kv -> !Double.isNaN(kv._2()));
代码示例来源:origin: OryxProject/oryx
/**
 * Combines {@link Rating}s with the same user/item into one, with score as the sum of
 * all of the scores.
 *
 * @param original ratings possibly containing duplicate (user, product) keys
 * @param epsilon scale divisor applied before log1p when log-strength scoring is enabled
 * @return one {@link Rating} per (user, product) pair
 */
private JavaRDD<Rating> aggregateScores(JavaRDD<? extends Rating> original, double epsilon) {
  // Re-key every rating by its (user, product) pair so duplicates collide.
  JavaPairRDD<Tuple2<Integer,Integer>,Double> keyedScores = original.mapToPair(
      r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating()));
  JavaPairRDD<Tuple2<Integer,Integer>,Double> combined;
  if (implicit) {
    // TODO can we avoid groupByKey? reduce, combine, fold don't seem viable since
    // they don't guarantee the delete elements are properly handled
    combined = keyedScores.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN);
  } else {
    // Non-implicit data: the most recent score for a key wins outright.
    combined = keyedScores.foldByKey(Double.NaN, (previous, latest) -> latest);
  }
  // A NaN score marks an entry that must be dropped from the result.
  JavaPairRDD<Tuple2<Integer,Integer>,Double> kept =
      combined.filter(pair -> !Double.isNaN(pair._2()));
  if (logStrength) {
    // Dampen large aggregate scores: log1p of the epsilon-scaled sum.
    return kept.map(t -> new Rating(
        t._1()._1(),
        t._1()._2(),
        Math.log1p(t._2() / epsilon)));
  }
  // Otherwise emit the aggregated score unchanged.
  return kept.map(t -> new Rating(
      t._1()._1(),
      t._1()._2(),
      t._2()));
}
代码示例来源:origin: mahmoudparsian/data-algorithms-book
anagrams.filter((Tuple2<String, Map<String, Integer>> entry) -> {
Map<String, Integer> map = entry._2;
if (map.size() > 1) {
代码示例来源:origin: mahmoudparsian/data-algorithms-book
= anagramsAsSet.filter((Tuple2<String, Set<String>> entry) -> {
Set<String> set = entry._2;
if (set.size() > 1) {
代码示例来源:origin: mahmoudparsian/data-algorithms-book
for (int i = 0; i < centroids.size(); i++) {
final int index = i;
List<Tuple2<String, Vector>> samples = data.filter(new Function<Tuple2<String, Vector>, Boolean>() {
@Override
public Boolean call(Tuple2<String, Vector> in) throws Exception {
代码示例来源:origin: mahmoudparsian/data-algorithms-book
double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
@Override
public Boolean call(Tuple2<Double, Double> pl) {
代码示例来源:origin: mahmoudparsian/data-algorithms-book
double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
@Override
public Boolean call(Tuple2<Double, Double> pl) {
代码示例来源:origin: mahmoudparsian/data-algorithms-book
double accuracy = predictionAndLabel.filter((Tuple2<Double, Double> pl) -> pl._1().equals(pl._2()))
.count() / (double) test.count();
代码示例来源:origin: mahmoudparsian/data-algorithms-book
extracted.filter((Tuple2<Tuple3<String, String, String>, LogStatistics> s) -> {
Tuple3<String, String, String> t3 = s._1;
return (t3._1() != null); // exclude Tuple3(null,null,null)
代码示例来源:origin: mahmoudparsian/data-algorithms-book
extracted.filter(new Function<
Tuple2<Tuple3<String, String, String>, LogStatistics>,
Boolean
内容来源于网络,如有侵权,请联系作者删除!