Usage of the org.apache.spark.mllib.linalg.Vector class, with code examples


This article collects Java code examples for the org.apache.spark.mllib.linalg.Vector class and shows how the class is used in practice. The examples are extracted from selected projects on GitHub, Stack Overflow, Maven, and similar platforms, and should serve as useful references. Details of the Vector class:
Package path: org.apache.spark.mllib.linalg
Class name: Vector

Vector overview

Vector is MLlib's abstraction for a numeric vector of double values indexed from 0. Its two concrete implementations, DenseVector and SparseVector, are normally constructed through the Vectors factory class; the examples below mostly read elements with apply(int), query the length with size(), and convert to a double[] with toArray().
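
To make the project excerpts below easier to follow, here is a small self-contained sketch (not taken from any of the projects cited below; the class name and values are illustrative) showing how a Vector is typically created and read:

import java.util.Arrays;

import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;

public class VectorBasicsExample {
 public static void main(String[] args) {
  // Dense vector: every element is stored explicitly.
  Vector dense = Vectors.dense(1.0, 0.0, 3.0);
  // Sparse vector: only the non-zero indices and values are stored.
  Vector sparse = Vectors.sparse(3, new int[]{0, 2}, new double[]{1.0, 3.0});

  System.out.println("size      = " + dense.size());        // 3
  System.out.println("element 2 = " + dense.apply(2));      // 3.0
  System.out.println("as array  = " + Arrays.toString(dense.toArray()));
  System.out.println("non-zeros = " + Arrays.toString(dense.toSparse().indices()));
  // Dense and sparse vectors with the same elements compare equal.
  System.out.println("equal     = " + dense.equals(sparse)); // true
 }
}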

Code examples

Code example source: origin: OryxProject/oryx

private JavaPairRDD<Integer, Iterable<double[]>> fetchClusteredPoints(JavaRDD<? extends Vector> evalData) {
 return evalData.mapToPair(vector -> {
  double closestDist = Double.POSITIVE_INFINITY;
  int minClusterID = Integer.MIN_VALUE;
  double[] vec = vector.toArray();
  DistanceFn<double[]> distanceFn = getDistanceFn();
  Map<Integer,ClusterInfo> clusters = getClustersByID();
  for (ClusterInfo cluster : clusters.values()) {
   double distance = distanceFn.applyAsDouble(cluster.getCenter(), vec);
   if (distance < closestDist) {
    closestDist = distance;
    minClusterID = cluster.getID();
   }
  }
  Preconditions.checkState(!Double.isInfinite(closestDist) && !Double.isNaN(closestDist));
  return new Tuple2<>(minClusterID, vec);
 }).groupByKey();
}

Code example source: origin: mahmoudparsian/data-algorithms-book

THE_LOGGER.info("Number of data records " + data.count());
for (Vector t : centroids) {
  outputWriter.write("" + t.apply(0) + " " + t.apply(1) + " " + t.apply(2) + "\n");
}

Code example source: origin: mahmoudparsian/data-algorithms-book

static double squaredDistance(Vector a, Vector b) {
  double distance = 0.0;
  int size = a.size();
  for (int i = 0; i < size; i++) {
    double diff = a.apply(i) - b.apply(i);
    distance += diff * diff;
  }
  return distance;
}

Code example source: origin: org.apache.spark/spark-mllib

@Test
@SuppressWarnings("unchecked")
public void testUniformVectorRDD() {
 long m = 100L;
 int n = 10;
 int p = 2;
 long seed = 1L;
 JavaRDD<Vector> rdd1 = uniformJavaVectorRDD(jsc, m, n);
 JavaRDD<Vector> rdd2 = uniformJavaVectorRDD(jsc, m, n, p);
 JavaRDD<Vector> rdd3 = uniformJavaVectorRDD(jsc, m, n, p, seed);
 for (JavaRDD<Vector> rdd : Arrays.asList(rdd1, rdd2, rdd3)) {
  Assert.assertEquals(m, rdd.count());
  Assert.assertEquals(n, rdd.first().size());
 }
}

Code example source: origin: edu.usc.ir/age-predictor-cli

// Excerpt truncated in the source; elided statements are marked with comments.
 .cache();
JavaRDD<Row> samples = data.map(
 new Function<String, Row>() {
  public Row call(String s) {
   // ... Row construction elided in the original excerpt ...
  }
 }).cache();
JavaRDD<Row> validSamples = samples.filter(/* ... predicate elided in the original excerpt ... */);
final LassoModel model = algorithm.run(JavaRDD.toRDD(parsedData));
System.out.println("Coefficients: " + Arrays.toString(model.weights().toArray()));
System.out.println("Intercept: " + model.intercept());

Code example source: origin: org.apache.spark/spark-mllib_2.10

@Test
public void tfIdf() {
 // The tests are to check Java compatibility.
 HashingTF tf = new HashingTF();
 @SuppressWarnings("unchecked")
 JavaRDD<List<String>> documents = jsc.parallelize(Arrays.asList(
  Arrays.asList("this is a sentence".split(" ")),
  Arrays.asList("this is another sentence".split(" ")),
  Arrays.asList("this is still a sentence".split(" "))), 2);
 JavaRDD<Vector> termFreqs = tf.transform(documents);
 termFreqs.collect();
 IDF idf = new IDF();
 JavaRDD<Vector> tfIdfs = idf.fit(termFreqs).transform(termFreqs);
 List<Vector> localTfIdfs = tfIdfs.collect();
 int indexOfThis = tf.indexOf("this");
 for (Vector v : localTfIdfs) {
  Assert.assertEquals(0.0, v.apply(indexOfThis), 1e-15);
 }
}

Code example source: origin: OryxProject/oryx

// Excerpt truncated in the source; elided statements are marked with comments.
return trainPointData.mapPartitions(data -> {
  DecisionTreeModel[] trees = model.trees();
  List<IntLongHashMap> treeNodeIDCounts = IntStream.range(0, trees.length).
    mapToObj(i -> new IntLongHashMap()).collect(Collectors.toList());
  data.forEachRemaining(datum -> {
   double[] featureVector = datum.features().toArray();
   for (int i = 0; i < trees.length; i++) {
    DecisionTreeModel tree = trees[i];
    // ... per-tree node-ID counting elided in the original excerpt ...
   }
  });
  return Collections.singleton(treeNodeIDCounts).iterator();
 }).reduce((a, b) -> {
  Preconditions.checkArgument(a.size() == b.size());
  for (int i = 0; i < a.size(); i++) {
   // ... merging of the per-partition counts elided in the original excerpt ...
  }
  // ... remainder elided ...
 });

Code example source: origin: OryxProject/oryx

private ClusteringModel pmmlClusteringModel(KMeansModel model,
                      Map<Integer,Long> clusterSizesMap) {
 Vector[] clusterCenters = model.clusterCenters();
 List<ClusteringField> clusteringFields = new ArrayList<>();
 for (int i = 0; i < inputSchema.getNumFeatures(); i++) {
  if (inputSchema.isActive(i)) {
   FieldName fieldName = FieldName.create(inputSchema.getFeatureNames().get(i));
   ClusteringField clusteringField =
     new ClusteringField(fieldName).setCenterField(ClusteringField.CenterField.TRUE);
   clusteringFields.add(clusteringField);
  }
 }
 List<Cluster> clusters = new ArrayList<>(clusterCenters.length);
 for (int i = 0; i < clusterCenters.length; i++) {
  clusters.add(new Cluster().setId(Integer.toString(i))
           .setSize(clusterSizesMap.get(i).intValue())
           .setArray(AppPMMLUtils.toArray(clusterCenters[i].toArray())));
 }
 return new ClusteringModel(
   MiningFunction.CLUSTERING,
   ClusteringModel.ModelClass.CENTER_BASED,
   clusters.size(),
   AppPMMLUtils.buildMiningSchema(inputSchema),
   new ComparisonMeasure(ComparisonMeasure.Kind.DISTANCE).setMeasure(new SquaredEuclidean()),
   clusteringFields,
   clusters);
}

Code example source: origin: ddf-project/DDF

// Extract the label and feature values from either a LabeledPoint or a plain Vector sample.
if (sample instanceof LabeledPoint) {
 LabeledPoint s = (LabeledPoint) sample;
 label = s.label();
 features = s.features().toArray();
} else {
 Vector vector = (Vector) sample;
 if (mHasLabels) {
  // The last element of the vector holds the label; the rest are features.
  label = vector.apply(vector.size() - 1);
  features = Arrays.copyOf(vector.toArray(), vector.size() - 1);
 } else {
  features = vector.toArray();
 }
}

Code example source: origin: phuonglh/vn.vitk

int d = features.size();
for (int k : labels) {
  if (k > 0) {
    // Sum the weights of the non-zero features for label k.
    for (int j : features.toSparse().indices())
      score[k] += weights.apply((k - 1) * d + j);
    if (score[k] > maxScore) {
      maxScore = score[k];
      // ... remainder of the excerpt elided in the source ...
    }
  }
}

Code example source: origin: org.apache.spark/spark-mllib_2.11

@Test
public void testConvertVectorColumnsToAndFromML() {
 Vector x = Vectors.dense(2.0);
 Dataset<Row> dataset = spark.createDataFrame(
  Collections.singletonList(new LabeledPoint(1.0, x)), LabeledPoint.class
 ).select("label", "features");
 Dataset<Row> newDataset1 = MLUtils.convertVectorColumnsToML(dataset);
 Row new1 = newDataset1.first();
 Assert.assertEquals(RowFactory.create(1.0, x.asML()), new1);
 Row new2 = MLUtils.convertVectorColumnsToML(dataset, "features").first();
 Assert.assertEquals(new1, new2);
 Row old1 = MLUtils.convertVectorColumnsFromML(newDataset1).first();
 Assert.assertEquals(RowFactory.create(1.0, x), old1);
}

Code example source: origin: org.apache.spark/spark-mllib_2.11

@Test
@SuppressWarnings("unchecked")
public void testNormalVectorRDD() {
 long m = 100L;
 int n = 10;
 int p = 2;
 long seed = 1L;
 JavaRDD<Vector> rdd1 = normalJavaVectorRDD(jsc, m, n);
 JavaRDD<Vector> rdd2 = normalJavaVectorRDD(jsc, m, n, p);
 JavaRDD<Vector> rdd3 = normalJavaVectorRDD(jsc, m, n, p, seed);
 for (JavaRDD<Vector> rdd : Arrays.asList(rdd1, rdd2, rdd3)) {
  Assert.assertEquals(m, rdd.count());
  Assert.assertEquals(n, rdd.first().size());
 }
}

Code example source: origin: org.apache.spark/spark-mllib

@Test
public void tfIdf() {
 // The tests are to check Java compatibility.
 HashingTF tf = new HashingTF();
 @SuppressWarnings("unchecked")
 JavaRDD<List<String>> documents = jsc.parallelize(Arrays.asList(
  Arrays.asList("this is a sentence".split(" ")),
  Arrays.asList("this is another sentence".split(" ")),
  Arrays.asList("this is still a sentence".split(" "))), 2);
 JavaRDD<Vector> termFreqs = tf.transform(documents);
 termFreqs.collect();
 IDF idf = new IDF();
 JavaRDD<Vector> tfIdfs = idf.fit(termFreqs).transform(termFreqs);
 List<Vector> localTfIdfs = tfIdfs.collect();
 int indexOfThis = tf.indexOf("this");
 for (Vector v : localTfIdfs) {
  Assert.assertEquals(0.0, v.apply(indexOfThis), 1e-15);
 }
}

Code example source: origin: OryxProject/oryx

/**
 * @param trainPointData data to run down trees
 * @param model random decision forest model to count on
 * @return map of predictor index to the number of training examples that reached a
 *  node whose decision is based on that feature. The index is among predictors, not all
 *  features, since there are fewer predictors than features. That is, the index will
 *  match the one used in the {@link RandomForestModel}.
 */
private static IntLongHashMap predictorExampleCounts(JavaRDD<? extends LabeledPoint> trainPointData,
                           RandomForestModel model) {
 return trainPointData.mapPartitions(data -> {
   IntLongHashMap featureIndexCount = new IntLongHashMap();
   data.forEachRemaining(datum -> {
    double[] featureVector = datum.features().toArray();
    for (DecisionTreeModel tree : model.trees()) {
     org.apache.spark.mllib.tree.model.Node node = tree.topNode();
     // This logic cloned from Node.predict:
     while (!node.isLeaf()) {
      Split split = node.split().get();
      int featureIndex = split.feature();
      // Count feature
      featureIndexCount.addToValue(featureIndex, 1);
      node = nextNode(featureVector, node, split, featureIndex);
     }
    }
   });
   return Collections.singleton(featureIndexCount).iterator();
 }).reduce(RDFUpdate::merge);
}

Code example source: origin: mahmoudparsian/data-algorithms-book

static void debug(String record, Vector v) {
  THE_LOGGER.info("DEBUG started:");
  double[] d = v.toArray();
  StringBuilder builder = new StringBuilder();
  builder.append("DEBUG[record=");
  builder.append(record);
  builder.append("]:");
  for (int i=0; i < d.length; i++){
    builder.append("\t");
    builder.append(d[i]);
  }
  THE_LOGGER.info(builder.toString());
}

Code example source: origin: org.apache.spark/spark-mllib_2.10

@Test
public void testConvertVectorColumnsToAndFromML() {
 Vector x = Vectors.dense(2.0);
 Dataset<Row> dataset = spark.createDataFrame(
  Collections.singletonList(new LabeledPoint(1.0, x)), LabeledPoint.class
 ).select("label", "features");
 Dataset<Row> newDataset1 = MLUtils.convertVectorColumnsToML(dataset);
 Row new1 = newDataset1.first();
 Assert.assertEquals(RowFactory.create(1.0, x.asML()), new1);
 Row new2 = MLUtils.convertVectorColumnsToML(dataset, "features").first();
 Assert.assertEquals(new1, new2);
 Row old1 = MLUtils.convertVectorColumnsFromML(newDataset1).first();
 Assert.assertEquals(RowFactory.create(1.0, x), old1);
}

Code example source: origin: org.apache.spark/spark-mllib_2.10

@Test
@SuppressWarnings("unchecked")
public void testNormalVectorRDD() {
 long m = 100L;
 int n = 10;
 int p = 2;
 long seed = 1L;
 JavaRDD<Vector> rdd1 = normalJavaVectorRDD(jsc, m, n);
 JavaRDD<Vector> rdd2 = normalJavaVectorRDD(jsc, m, n, p);
 JavaRDD<Vector> rdd3 = normalJavaVectorRDD(jsc, m, n, p, seed);
 for (JavaRDD<Vector> rdd : Arrays.asList(rdd1, rdd2, rdd3)) {
  Assert.assertEquals(m, rdd.count());
  Assert.assertEquals(n, rdd.first().size());
 }
}

Code example source: origin: org.apache.spark/spark-mllib

@Test
public void tfIdfMinimumDocumentFrequency() {
 // The tests are to check Java compatibility.
 HashingTF tf = new HashingTF();
 @SuppressWarnings("unchecked")
 JavaRDD<List<String>> documents = jsc.parallelize(Arrays.asList(
  Arrays.asList("this is a sentence".split(" ")),
  Arrays.asList("this is another sentence".split(" ")),
  Arrays.asList("this is still a sentence".split(" "))), 2);
 JavaRDD<Vector> termFreqs = tf.transform(documents);
 termFreqs.collect();
 IDF idf = new IDF(2);
 JavaRDD<Vector> tfIdfs = idf.fit(termFreqs).transform(termFreqs);
 List<Vector> localTfIdfs = tfIdfs.collect();
 int indexOfThis = tf.indexOf("this");
 for (Vector v : localTfIdfs) {
  Assert.assertEquals(0.0, v.apply(indexOfThis), 1e-15);
 }
}

Code example source: origin: OryxProject/oryx

/**
 * @param evalData points to cluster for evaluation
 * @return cluster IDs as keys, and metrics for each cluster like the count, sum of distances to centroid,
 *  and sum of squared distances
 */
JavaPairRDD<Integer,ClusterMetric> fetchClusterMetrics(JavaRDD<Vector> evalData) {
 return evalData.mapToPair(vector -> {
  double closestDist = Double.POSITIVE_INFINITY;
  int minClusterID = Integer.MIN_VALUE;
  double[] vec = vector.toArray();
  for (ClusterInfo cluster : clusters.values()) {
   double distance = distanceFn.applyAsDouble(cluster.getCenter(), vec);
   if (distance < closestDist) {
    closestDist = distance;
    minClusterID = cluster.getID();
   }
  }
  Preconditions.checkState(!Double.isInfinite(closestDist) && !Double.isNaN(closestDist));
  return new Tuple2<>(minClusterID, new ClusterMetric(1L, closestDist, closestDist * closestDist));
 }).reduceByKey(ClusterMetric::add);
}

Code example source: origin: ypriverol/spark-java8

LOGGER.info("Number of data records " + data.count());
for (Vector t : centroids) {
  outputWriter.write("" + t.apply(0) + " " + t.apply(1) + " " + t.apply(2) + "\n");
}
