Usage of the org.apache.spark.api.java.JavaRDD.rdd() method, with code examples

This article collects code examples of the Java method org.apache.spark.api.java.JavaRDD.rdd(), showing how JavaRDD.rdd() is used in practice. The examples are taken from curated projects on platforms such as GitHub, Stack Overflow, and Maven, so they should serve as solid references. The details of JavaRDD.rdd() are as follows:
Package: org.apache.spark.api.java
Class: JavaRDD
Method: rdd

About JavaRDD.rdd

JavaRDD.rdd() returns the underlying Scala RDD<T> that the JavaRDD wraps. JavaRDD is only a thin Java-friendly wrapper, so the call copies no data. It is most often used to hand data to Scala-side APIs that expect a plain RDD, such as MLlib's RDD-based trainers; the reverse direction is covered by JavaRDD.fromRDD() and wrapRDD(), both of which appear in the examples below.
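
Before the examples, here is a minimal sketch of the round trip between a JavaRDD and its underlying Scala RDD. It is not taken from any of the projects below, and all names are illustrative.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.rdd.RDD;

public class RddRoundTrip {
 public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("rdd-round-trip").setMaster("local[*]");
  try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
   JavaRDD<Integer> javaRdd = jsc.parallelize(Arrays.asList(1, 2, 3));
   // rdd() unwraps the Scala RDD backing this JavaRDD; no data is copied.
   RDD<Integer> scalaRdd = javaRdd.rdd();
   // wrapRDD() (also used in several examples below) wraps a Scala RDD
   // back into the Java API.
   JavaRDD<Integer> wrapped = javaRdd.wrapRDD(scalaRdd);
   System.out.println(wrapped.count()); // prints 3
  }
 }
}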

Code examples

Example source: OryxProject/oryx

/**
 * @param javaRDD RDD whose underlying RDD must be an instance of {@code HasOffsetRanges},
 *  such as {@code KafkaRDD}
 */
@Override
public void call(JavaRDD<T> javaRDD) {
 OffsetRange[] ranges = ((HasOffsetRanges) javaRDD.rdd()).offsetRanges();
 Map<Pair<String,Integer>,Long> newOffsets = new HashMap<>(ranges.length);
 for (OffsetRange range : ranges) {
  newOffsets.put(new Pair<>(range.topic(), range.partition()), range.untilOffset());
 }
 log.info("Updating offsets: {}", newOffsets);
 KafkaUtils.setOffsets(inputTopicLockMaster, group, newOffsets);
}

Example source: OryxProject/oryx

/**
 * @param sparkContext    active Spark Context
 * @param trainData       training data on which to build a model
 * @param hyperParameters ordered list of hyper parameter values to use in building model
 * @param candidatePath   directory where additional model files can be written
 * @return a {@link PMML} representation of a model trained on the given data
 */
@Override
public PMML buildModel(JavaSparkContext sparkContext,
            JavaRDD<String> trainData,
            List<?> hyperParameters,
            Path candidatePath) {
 int numClusters = (Integer) hyperParameters.get(0);
 Preconditions.checkArgument(numClusters > 1);
 log.info("Building KMeans Model with {} clusters", numClusters);
 JavaRDD<Vector> trainingData = parsedToVectorRDD(trainData.map(MLFunctions.PARSE_FN));
 KMeansModel kMeansModel = KMeans.train(trainingData.rdd(), numClusters, maxIterations,
                     numberOfRuns, initializationStrategy);
 return kMeansModelToPMML(kMeansModel, fetchClusterCountsFromModel(trainingData, kMeansModel));
}

Example source: OryxProject/oryx

/**
 * Computes root mean squared error of {@link Rating#rating()} versus predicted value.
 */
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) {
 JavaPairRDD<Tuple2<Integer,Integer>,Double> testUserProductValues =
   testData.mapToPair(rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()));
 @SuppressWarnings("unchecked")
 RDD<Tuple2<Object,Object>> testUserProducts =
   (RDD<Tuple2<Object,Object>>) (RDD<?>) testUserProductValues.keys().rdd();
 JavaRDD<Rating> predictions = testData.wrapRDD(mfModel.predict(testUserProducts));
 double mse = predictions.mapToPair(
   rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating())
 ).join(testUserProductValues).values().mapToDouble(valuePrediction -> {
  double diff = valuePrediction._1() - valuePrediction._2();
  return diff * diff;
 }).mean();
 return Math.sqrt(mse);
}

Example source: databricks/learning-spark

LogisticRegressionModel model = lrLearner.run(trainingData.rdd());
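
The one-liner above omits how lrLearner is built. Below is a fuller sketch of the same pattern; it uses LogisticRegressionWithLBFGS rather than the book's learner, since both expose the same run(RDD<LabeledPoint>) entry point, and the helper method and its names are illustrative.

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.classification.LogisticRegressionModel;
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS;
import org.apache.spark.mllib.regression.LabeledPoint;

static LogisticRegressionModel trainLogistic(JavaRDD<LabeledPoint> trainingData) {
 // run() is implemented on the Scala side and expects an RDD<LabeledPoint>,
 // so the JavaRDD is unwrapped with rdd().
 return new LogisticRegressionWithLBFGS()
   .setNumClasses(2)
   .run(trainingData.rdd());
}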

Example source: OryxProject/oryx

/**
 * Default implementation which randomly splits new data into train/test sets.
 * This handles the case where {@link #getTestFraction()} is not 0 or 1.
 *
 * @param newData data that has arrived in the current input batch
 * @return a {@link Pair} of train, test {@link RDD}s.
 */
protected Pair<JavaRDD<M>,JavaRDD<M>> splitNewDataToTrainTest(JavaRDD<M> newData) {
 RDD<M>[] testTrainRDDs = newData.rdd().randomSplit(
   new double[]{1.0 - testFraction, testFraction},
   RandomManager.getRandom().nextLong());
 return new Pair<>(newData.wrapRDD(testTrainRDDs[0]),
          newData.wrapRDD(testTrainRDDs[1]));
}

Example source: org.apache.spark/spark-core (the identical test also appears in spark-core_2.10 and spark-core_2.11)

@Test
public void collectUnderlyingScalaRDD() {
 List<SomeCustomClass> data = new ArrayList<>();
 for (int i = 0; i < 100; i++) {
  data.add(new SomeCustomClass());
 }
 JavaRDD<SomeCustomClass> rdd = sc.parallelize(data);
 SomeCustomClass[] collected =
  (SomeCustomClass[]) rdd.rdd().retag(SomeCustomClass.class).collect();
 assertEquals(data.size(), collected.length);
}

Example source: OryxProject/oryx

RDD<Rating> trainingRatingDataRDD = trainRatingData.rdd();
trainingRatingDataRDD.cache();
MatrixFactorizationModel model = als.run(trainingRatingDataRDD);

Example source: mahmoudparsian/data-algorithms-book

final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);

Example source: mahmoudparsian/data-algorithms-book

).rdd()).mean();
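
This fragment has the shape of the classic MLlib regression walkthrough, where squared errors are rewrapped as a JavaDoubleRDD to reach numeric helpers such as mean(). A hedged reconstruction follows; valuesAndPreds is assumed to hold (label, prediction) pairs, and all names are illustrative.

import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaRDD;
import scala.Tuple2;

static double meanSquaredError(JavaRDD<Tuple2<Double, Double>> valuesAndPreds) {
 // Map each pair to its squared error, unwrap the Scala RDD with rdd(),
 // and rewrap it as a JavaDoubleRDD, whose mean() yields the MSE.
 return new JavaDoubleRDD(valuesAndPreds
   .map(pair -> (Object) Math.pow(pair._1() - pair._2(), 2.0))
   .rdd()).mean();
}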

Example source: mahmoudparsian/data-algorithms-book

LogisticRegressionModel model = logisticRegression.run(trainingData.rdd());

Example source: mahmoudparsian/data-algorithms-book

LogisticRegressionModel model = learner.run(trainingData.rdd());

Example source: mahmoudparsian/data-algorithms-book

.run(training.rdd());

Example source: mahmoudparsian/data-algorithms-book

JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData);
Dataset<Row> peopleFromJsonRDD = sqlContext.read().json(anotherPeopleRDD.rdd());
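
Side note: on Spark 2.2+ the rdd() unwrapping can be skipped entirely, because DataFrameReader.json() also accepts a Dataset<String>. A minimal sketch, assuming a SparkSession is available and with an illustrative JSON sample:

import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

static Dataset<Row> peopleFromJson(SparkSession spark) {
 Dataset<String> jsonDs = spark.createDataset(
   Arrays.asList("{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\"}}"),
   Encoders.STRING());
 // No rdd() needed: json(Dataset<String>) parses the strings directly.
 return spark.read().json(jsonDs);
}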

Example source: org.apache.spark/spark-mllib_2.10

@Test
public void runUsingStaticMethods() {
 JavaRDD<LabeledPoint> testRDD = jsc.parallelize(POINTS, 2).cache();
 NaiveBayesModel model1 = NaiveBayes.train(testRDD.rdd());
 int numAccurate1 = validatePrediction(POINTS, model1);
 Assert.assertEquals(POINTS.size(), numAccurate1);
 NaiveBayesModel model2 = NaiveBayes.train(testRDD.rdd(), 0.5);
 int numAccurate2 = validatePrediction(POINTS, model2);
 Assert.assertEquals(POINTS.size(), numAccurate2);
}

Example source: org.apache.spark/spark-mllib_2.10

@Test
public void testPredictJavaRDD() {
 JavaRDD<LabeledPoint> examples = jsc.parallelize(POINTS, 2).cache();
 NaiveBayesModel model = NaiveBayes.train(examples.rdd());
 JavaRDD<Vector> vectors = examples.map(LabeledPoint::features);
 JavaRDD<Double> predictions = model.predict(vectors);
 // Should be able to get the first prediction.
 predictions.first();
}

Example source: org.apache.spark/spark-mllib_2.11

@Test
public void runImplicitALSUsingStaticMethods() {
 int features = 1;
 int iterations = 15;
 int users = 80;
 int products = 160;
 Tuple3<List<Rating>, double[], double[]> testData =
  ALSSuite.generateRatingsAsJava(users, products, features, 0.7, true, false);
 JavaRDD<Rating> data = jsc.parallelize(testData._1());
 MatrixFactorizationModel model = ALS.trainImplicit(data.rdd(), features, iterations);
 validatePrediction(model, users, products, testData._2(), 0.4, true, testData._3());
}
