This article collects Java code examples for the org.apache.spark.mllib.linalg.Vector class and shows how the class is used in practice. The examples come mainly from platforms such as GitHub, Stack Overflow, and Maven, extracted from selected projects, so they should serve as useful references. Details of the Vector class:

Package path: org.apache.spark.mllib.linalg.Vector
Class name: Vector
Description: none available
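Before the collected examples, here is a minimal self-contained sketch of the core Vector API that recurs throughout them (construction via the Vectors factory, element access with apply, plus size, toArray, and toSparse). It is illustrative only and not taken from any of the projects below.

import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;

public class VectorBasics {
  public static void main(String[] args) {
    // Dense vector: every value stored explicitly.
    Vector dense = Vectors.dense(1.0, 0.0, 3.0);
    // Sparse vector: size 3, non-zero entries at indices 0 and 2.
    Vector sparse = Vectors.sparse(3, new int[]{0, 2}, new double[]{1.0, 3.0});

    System.out.println(dense.size());      // 3
    System.out.println(dense.apply(2));    // 3.0 (element access)
    double[] copy = dense.toArray();       // copy values out as double[]
    System.out.println(dense.toSparse());  // dense -> sparse representation
  }
}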
Code example source: OryxProject/oryx

private JavaPairRDD<Integer, Iterable<double[]>> fetchClusteredPoints(JavaRDD<? extends Vector> evalData) {
  return evalData.mapToPair(vector -> {
    double closestDist = Double.POSITIVE_INFINITY;
    int minClusterID = Integer.MIN_VALUE;
    double[] vec = vector.toArray();
    DistanceFn<double[]> distanceFn = getDistanceFn();
    Map<Integer,ClusterInfo> clusters = getClustersByID();
    // Find the nearest cluster center for this point.
    for (ClusterInfo cluster : clusters.values()) {
      double distance = distanceFn.applyAsDouble(cluster.getCenter(), vec);
      if (distance < closestDist) {
        closestDist = distance;
        minClusterID = cluster.getID();
      }
    }
    Preconditions.checkState(!Double.isInfinite(closestDist) && !Double.isNaN(closestDist));
    return new Tuple2<>(minClusterID, vec);
  }).groupByKey();
}
Code example source: mahmoudparsian/data-algorithms-book

THE_LOGGER.info("Number of data records " + data.count());
for (Vector t : centroids) {
  outputWriter.write("" + t.apply(0) + " " + t.apply(1) + " " + t.apply(2) + "\n");
}
// (snippet truncated in the original source)
Code example source: mahmoudparsian/data-algorithms-book

// Squared Euclidean distance between two Vectors of the same size.
static double squaredDistance(Vector a, Vector b) {
  double distance = 0.0;
  int size = a.size();
  for (int i = 0; i < size; i++) {
    double diff = a.apply(i) - b.apply(i);
    distance += diff * diff;
  }
  return distance;
}
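A quick usage sketch for the helper above (hypothetical values, not from the book; assumes Vectors from org.apache.spark.mllib.linalg):

Vector a = Vectors.dense(1.0, 2.0, 3.0);
Vector b = Vectors.dense(4.0, 6.0, 3.0);
// (1-4)^2 + (2-6)^2 + (3-3)^2 = 9 + 16 + 0
double d = squaredDistance(a, b);  // 25.0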
Code example source: org.apache.spark/spark-mllib

@Test
@SuppressWarnings("unchecked")
public void testUniformVectorRDD() {
  long m = 100L;
  int n = 10;
  int p = 2;
  long seed = 1L;
  // uniformJavaVectorRDD is statically imported from org.apache.spark.mllib.random.RandomRDDs.
  JavaRDD<Vector> rdd1 = uniformJavaVectorRDD(jsc, m, n);
  JavaRDD<Vector> rdd2 = uniformJavaVectorRDD(jsc, m, n, p);
  JavaRDD<Vector> rdd3 = uniformJavaVectorRDD(jsc, m, n, p, seed);
  for (JavaRDD<Vector> rdd : Arrays.asList(rdd1, rdd2, rdd3)) {
    Assert.assertEquals(m, rdd.count());
    Assert.assertEquals(n, rdd.first().size());
  }
}
Code example source: edu.usc.ir/age-predictor-cli

    .cache();
JavaRDD<Row> samples = data.map(
    new Function<String, Row>() {
      public Row call(String s) {
        // ... (method body elided in the original snippet)
      }
    }).cache();
JavaRDD<Row> validSamples = samples.filter(
    // ... (filter predicate elided in the original snippet)
final LassoModel model = algorithm.run(JavaRDD.toRDD(parsedData));
System.out.println("Coefficients: " + Arrays.toString(model.weights().toArray()));
System.out.println("Intercept: " + model.intercept());
Code example source: org.apache.spark/spark-mllib_2.10

@Test
public void tfIdf() {
  // The tests are to check Java compatibility.
  HashingTF tf = new HashingTF();
  @SuppressWarnings("unchecked")
  JavaRDD<List<String>> documents = jsc.parallelize(Arrays.asList(
    Arrays.asList("this is a sentence".split(" ")),
    Arrays.asList("this is another sentence".split(" ")),
    Arrays.asList("this is still a sentence".split(" "))), 2);
  JavaRDD<Vector> termFreqs = tf.transform(documents);
  termFreqs.collect();
  IDF idf = new IDF();
  JavaRDD<Vector> tfIdfs = idf.fit(termFreqs).transform(termFreqs);
  List<Vector> localTfIdfs = tfIdfs.collect();
  int indexOfThis = tf.indexOf("this");
  for (Vector v : localTfIdfs) {
    Assert.assertEquals(0.0, v.apply(indexOfThis), 1e-15);
  }
}
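The assertion passes because "this" occurs in all three documents: MLlib computes idf(t) = log((m + 1) / (df(t) + 1)), so with m = 3 documents and df("this") = 3 the IDF is log(4/4) = 0, making every TF-IDF weight for "this" zero.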
Code example source: OryxProject/oryx

return trainPointData.mapPartitions(data -> {
  DecisionTreeModel[] trees = model.trees();
  List<IntLongHashMap> treeNodeIDCounts = IntStream.range(0, trees.length).
      mapToObj(i -> new IntLongHashMap()).collect(Collectors.toList());
  data.forEachRemaining(datum -> {
    double[] featureVector = datum.features().toArray();
    for (int i = 0; i < trees.length; i++) {
      DecisionTreeModel tree = trees[i];
      // ... (per-tree node-ID counting elided in the original snippet)
    }
  });
  return Collections.singleton(treeNodeIDCounts).iterator();
}).reduce((a, b) -> {
  Preconditions.checkArgument(a.size() == b.size());
  for (int i = 0; i < a.size(); i++) {
    // ... (merge of the per-tree counts elided in the original snippet)
Code example source: OryxProject/oryx

private ClusteringModel pmmlClusteringModel(KMeansModel model,
                                            Map<Integer,Long> clusterSizesMap) {
  Vector[] clusterCenters = model.clusterCenters();
  List<ClusteringField> clusteringFields = new ArrayList<>();
  for (int i = 0; i < inputSchema.getNumFeatures(); i++) {
    if (inputSchema.isActive(i)) {
      FieldName fieldName = FieldName.create(inputSchema.getFeatureNames().get(i));
      ClusteringField clusteringField =
          new ClusteringField(fieldName).setCenterField(ClusteringField.CenterField.TRUE);
      clusteringFields.add(clusteringField);
    }
  }
  List<Cluster> clusters = new ArrayList<>(clusterCenters.length);
  for (int i = 0; i < clusterCenters.length; i++) {
    clusters.add(new Cluster().setId(Integer.toString(i))
        .setSize(clusterSizesMap.get(i).intValue())
        .setArray(AppPMMLUtils.toArray(clusterCenters[i].toArray())));
  }
  return new ClusteringModel(
      MiningFunction.CLUSTERING,
      ClusteringModel.ModelClass.CENTER_BASED,
      clusters.size(),
      AppPMMLUtils.buildMiningSchema(inputSchema),
      new ComparisonMeasure(ComparisonMeasure.Kind.DISTANCE).setMeasure(new SquaredEuclidean()),
      clusteringFields,
      clusters);
}
Code example source: ddf-project/DDF

// (branching on the sample's runtime type is elided in the original snippet)
LabeledPoint s = (LabeledPoint) sample;
label = s.label();
features = s.features().toArray();
// ... (else-branch context elided in the original snippet)
Vector vector = (Vector) sample;
if (mHasLabels) {
  // When labeled, the last element of the vector holds the label.
  label = vector.apply(vector.size() - 1);
  features = Arrays.copyOf(vector.toArray(), vector.size() - 1);
} else {
  features = vector.toArray();
}
Code example source: phuonglh/vn.vitk

int d = features.size();
for (int k : labels) {
  if (k > 0) {
    // Weights form a flattened (label x feature) matrix, so the weight of
    // feature j under label k sits at index (k-1) * d + j.
    for (int j : features.toSparse().indices())
      score[k] += weights.apply((k-1) * d + j);
    if (score[k] > maxScore) {
      maxScore = score[k];
      // ... (remainder elided in the original snippet)
Code example source: org.apache.spark/spark-mllib_2.11

@Test
public void testConvertVectorColumnsToAndFromML() {
  Vector x = Vectors.dense(2.0);
  Dataset<Row> dataset = spark.createDataFrame(
    Collections.singletonList(new LabeledPoint(1.0, x)), LabeledPoint.class
  ).select("label", "features");
  Dataset<Row> newDataset1 = MLUtils.convertVectorColumnsToML(dataset);
  Row new1 = newDataset1.first();
  // asML() converts an mllib Vector to its org.apache.spark.ml.linalg counterpart.
  Assert.assertEquals(RowFactory.create(1.0, x.asML()), new1);
  Row new2 = MLUtils.convertVectorColumnsToML(dataset, "features").first();
  Assert.assertEquals(new1, new2);
  Row old1 = MLUtils.convertVectorColumnsFromML(newDataset1).first();
  Assert.assertEquals(RowFactory.create(1.0, x), old1);
}
Code example source: org.apache.spark/spark-mllib_2.11

@Test
@SuppressWarnings("unchecked")
public void testNormalVectorRDD() {
  long m = 100L;
  int n = 10;
  int p = 2;
  long seed = 1L;
  // normalJavaVectorRDD draws i.i.d. samples from the standard normal distribution.
  JavaRDD<Vector> rdd1 = normalJavaVectorRDD(jsc, m, n);
  JavaRDD<Vector> rdd2 = normalJavaVectorRDD(jsc, m, n, p);
  JavaRDD<Vector> rdd3 = normalJavaVectorRDD(jsc, m, n, p, seed);
  for (JavaRDD<Vector> rdd : Arrays.asList(rdd1, rdd2, rdd3)) {
    Assert.assertEquals(m, rdd.count());
    Assert.assertEquals(n, rdd.first().size());
  }
}
Code example source: OryxProject/oryx

/**
 * @param trainPointData data to run down trees
 * @param model random decision forest model to count on
 * @return map of predictor index to the number of training examples that reached a
 *  node whose decision is based on that feature. The index is among predictors, not all
 *  features, since there are fewer predictors than features. That is, the index will
 *  match the one used in the {@link RandomForestModel}.
 */
private static IntLongHashMap predictorExampleCounts(JavaRDD<? extends LabeledPoint> trainPointData,
                                                     RandomForestModel model) {
  return trainPointData.mapPartitions(data -> {
    IntLongHashMap featureIndexCount = new IntLongHashMap();
    data.forEachRemaining(datum -> {
      double[] featureVector = datum.features().toArray();
      for (DecisionTreeModel tree : model.trees()) {
        org.apache.spark.mllib.tree.model.Node node = tree.topNode();
        // This logic cloned from Node.predict:
        while (!node.isLeaf()) {
          Split split = node.split().get();
          int featureIndex = split.feature();
          // Count feature
          featureIndexCount.addToValue(featureIndex, 1);
          node = nextNode(featureVector, node, split, featureIndex);
        }
      }
    });
    return Collections.singleton(featureIndexCount).iterator();
  }).reduce(RDFUpdate::merge);
}
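The nextNode(...) helper referenced in the loop is not included in the snippet. The following is a hypothetical minimal sketch of what it plausibly does, assuming the Spark MLlib tree API (Node.leftNode()/rightNode() return Scala Options, Split.threshold()) and handling only continuous splits; the real Oryx implementation would also need to handle categorical splits.

// Hypothetical reconstruction, not the original Oryx code.
private static org.apache.spark.mllib.tree.model.Node nextNode(double[] featureVector,
                                                               org.apache.spark.mllib.tree.model.Node node,
                                                               Split split,
                                                               int featureIndex) {
  // Continuous split: go left when the feature value is at or below the
  // threshold, otherwise right. (Categorical splits omitted for brevity.)
  if (featureVector[featureIndex] <= split.threshold()) {
    return node.leftNode().get();
  } else {
    return node.rightNode().get();
  }
}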
Code example source: mahmoudparsian/data-algorithms-book

static void debug(String record, Vector v) {
  THE_LOGGER.info("DEBUG started:");
  double[] d = v.toArray();
  StringBuilder builder = new StringBuilder();
  builder.append("DEBUG[record=");
  builder.append(record);
  builder.append("]:");
  for (int i = 0; i < d.length; i++) {
    builder.append("\t");
    builder.append(d[i]);
  }
  THE_LOGGER.info(builder.toString());
}
Code example source: org.apache.spark/spark-mllib

@Test
public void tfIdfMinimumDocumentFrequency() {
  // The tests are to check Java compatibility.
  HashingTF tf = new HashingTF();
  @SuppressWarnings("unchecked")
  JavaRDD<List<String>> documents = jsc.parallelize(Arrays.asList(
    Arrays.asList("this is a sentence".split(" ")),
    Arrays.asList("this is another sentence".split(" ")),
    Arrays.asList("this is still a sentence".split(" "))), 2);
  JavaRDD<Vector> termFreqs = tf.transform(documents);
  termFreqs.collect();
  // minDocFreq = 2: terms appearing in fewer than two documents get an IDF of 0.
  IDF idf = new IDF(2);
  JavaRDD<Vector> tfIdfs = idf.fit(termFreqs).transform(termFreqs);
  List<Vector> localTfIdfs = tfIdfs.collect();
  int indexOfThis = tf.indexOf("this");
  for (Vector v : localTfIdfs) {
    Assert.assertEquals(0.0, v.apply(indexOfThis), 1e-15);
  }
}
Code example source: OryxProject/oryx

/**
 * @param evalData points to cluster for evaluation
 * @return cluster IDs as keys, and metrics for each cluster like the count, sum of distances to centroid,
 *  and sum of squared distances
 */
JavaPairRDD<Integer,ClusterMetric> fetchClusterMetrics(JavaRDD<Vector> evalData) {
  return evalData.mapToPair(vector -> {
    double closestDist = Double.POSITIVE_INFINITY;
    int minClusterID = Integer.MIN_VALUE;
    double[] vec = vector.toArray();
    // Find the nearest cluster center for this point.
    for (ClusterInfo cluster : clusters.values()) {
      double distance = distanceFn.applyAsDouble(cluster.getCenter(), vec);
      if (distance < closestDist) {
        closestDist = distance;
        minClusterID = cluster.getID();
      }
    }
    Preconditions.checkState(!Double.isInfinite(closestDist) && !Double.isNaN(closestDist));
    return new Tuple2<>(minClusterID, new ClusterMetric(1L, closestDist, closestDist * closestDist));
  }).reduceByKey(ClusterMetric::add);
}
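ClusterMetric is not defined in the snippet. A hypothetical minimal version consistent with the constructor call and the ClusterMetric::add reduction above might look like this; the real Oryx class may differ.

// Hypothetical sketch, inferred from how the class is used above.
final class ClusterMetric {
  private final long count;
  private final double sumDist;
  private final double sumSquaredDist;

  ClusterMetric(long count, double sumDist, double sumSquaredDist) {
    this.count = count;
    this.sumDist = sumDist;
    this.sumSquaredDist = sumSquaredDist;
  }

  // Associative merge used by reduceByKey.
  ClusterMetric add(ClusterMetric other) {
    return new ClusterMetric(count + other.count,
        sumDist + other.sumDist,
        sumSquaredDist + other.sumSquaredDist);
  }
}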
Code example source: ypriverol/spark-java8

LOGGER.info("Number of data records " + data.count());
for (Vector t : centroids) {
  outputWriter.write("" + t.apply(0) + " " + t.apply(1) + " " + t.apply(2) + "\n");
}
// (snippet truncated in the original source)