本文整理了Java中org.apache.spark.sql.functions.col()
方法的一些代码示例,展示了functions.col()
的具体用法。这些代码示例主要来源于Github
/Stackoverflow
/Maven
等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。functions.col()
方法的具体详情如下:
包路径:org.apache.spark.sql.functions
类名称:functions
方法名:col
暂无
代码示例来源:origin: org.apache.spark/spark-sql_2.11
@Test
public void testUDF() {
  // Two-argument UDF: render the integer key as text and append the string value.
  UserDefinedFunction concat =
      udf((Integer i, String s) -> i.toString() + s, DataTypes.StringType);
  Dataset<Row> applied = spark.table("testData").select(concat.apply(col("key"), col("value")));
  String[] actual =
      applied.collectAsList().stream().map(row -> row.getString(0)).toArray(String[]::new);
  // Recompute the same concatenation on the driver side as the oracle.
  String[] expected = spark.table("testData").collectAsList().stream()
      .map(row -> row.get(0).toString() + row.getString(1)).toArray(String[]::new);
  Assert.assertArrayEquals(expected, actual);
}
}
代码示例来源:origin: org.apache.spark/spark-sql
@Test
public void testUDF() {
  // Two-argument UDF: render the integer key as text and append the string value.
  UserDefinedFunction concat =
      udf((Integer i, String s) -> i.toString() + s, DataTypes.StringType);
  Dataset<Row> applied = spark.table("testData").select(concat.apply(col("key"), col("value")));
  String[] actual =
      applied.collectAsList().stream().map(row -> row.getString(0)).toArray(String[]::new);
  // Recompute the same concatenation on the driver side as the oracle.
  String[] expected = spark.table("testData").collectAsList().stream()
      .map(row -> row.get(0).toString() + row.getString(1)).toArray(String[]::new);
  Assert.assertArrayEquals(expected, actual);
}
}
代码示例来源:origin: org.apache.spark/spark-sql
@Test
public void testJoin() {
  // Two small integer datasets aliased "a" and "b" so columns can be qualified.
  Dataset<Integer> left =
      spark.createDataset(Arrays.asList(1, 2, 3), Encoders.INT()).as("a");
  Dataset<Integer> right =
      spark.createDataset(Arrays.asList(2, 3, 4), Encoders.INT()).as("b");
  // joinWith on equal values yields typed pairs for the overlap {2, 3}.
  Dataset<Tuple2<Integer, Integer>> pairs =
      left.joinWith(right, col("a.value").equalTo(col("b.value")));
  Assert.assertEquals(
      Arrays.asList(tuple2(2, 2), tuple2(3, 3)),
      pairs.collectAsList());
}
代码示例来源:origin: org.apache.spark/spark-sql_2.11
@Test
public void testJoin() {
  // Two small integer datasets aliased "a" and "b" so columns can be qualified.
  Dataset<Integer> left =
      spark.createDataset(Arrays.asList(1, 2, 3), Encoders.INT()).as("a");
  Dataset<Integer> right =
      spark.createDataset(Arrays.asList(2, 3, 4), Encoders.INT()).as("b");
  // joinWith on equal values yields typed pairs for the overlap {2, 3}.
  Dataset<Tuple2<Integer, Integer>> pairs =
      left.joinWith(right, col("a.value").equalTo(col("b.value")));
  Assert.assertEquals(
      Arrays.asList(tuple2(2, 2), tuple2(3, 3)),
      pairs.collectAsList());
}
代码示例来源:origin: org.apache.spark/spark-sql_2.10
@Test
public void testJoin() {
  // Two small integer datasets aliased "a" and "b" so columns can be qualified.
  Dataset<Integer> left =
      spark.createDataset(Arrays.asList(1, 2, 3), Encoders.INT()).as("a");
  Dataset<Integer> right =
      spark.createDataset(Arrays.asList(2, 3, 4), Encoders.INT()).as("b");
  // joinWith on equal values yields typed pairs for the overlap {2, 3}.
  Dataset<Tuple2<Integer, Integer>> pairs =
      left.joinWith(right, col("a.value").equalTo(col("b.value")));
  Assert.assertEquals(
      Arrays.asList(tuple2(2, 2), tuple2(3, 3)),
      pairs.collectAsList());
}
代码示例来源:origin: org.apache.spark/spark-sql
@Test
public void testSampleBy() {
  // key = id mod 3 over [0, 100), materialized in 2 partitions.
  Dataset<Row> keyed = spark.range(0, 100, 1, 2).select(col("id").mod(3).as("key"));
  // Stratified sample with seed 0: keep ~10% of key 0 and ~20% of key 1; key 2 is dropped.
  Dataset<Row> sampled = keyed.stat().sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L);
  List<Row> counts = sampled.groupBy("key").count().orderBy("key").collectAsList();
  Row stratum0 = counts.get(0);
  Row stratum1 = counts.get(1);
  // Counts are random but bounded for this fixed seed.
  Assert.assertEquals(0, stratum0.getLong(0));
  Assert.assertTrue(0 <= stratum0.getLong(1) && stratum0.getLong(1) <= 8);
  Assert.assertEquals(1, stratum1.getLong(0));
  Assert.assertTrue(2 <= stratum1.getLong(1) && stratum1.getLong(1) <= 13);
}
代码示例来源:origin: org.apache.spark/spark-sql_2.10
// NOTE(review): excerpt from a larger test — `df` and `df2` are defined outside this snippet.
// Projection via Column objects vs. SQL expression strings.
df.select(col("key"), col("value"));
df.selectExpr("key", "value + 1");
// Sorting/ordering via Columns and via plain column names.
df.sort(col("key"), col("value"));
df.orderBy("key", "value");
df.orderBy(col("key"), col("value"));
// Grouping and aggregation, again in both name-based and Column-based forms.
df.groupBy("key", "value").agg(col("key"), col("value"), sum("value"));
df.groupBy(col("key"), col("value")).agg(col("key"), col("value"), sum("value"));
df.agg(first("key"), sum("value"));
df.groupBy().agg(countDistinct(col("key"), col("value")));
df.select(coalesce(col("key")));
// Math functions accept column names, Column objects, and literals interchangeably.
df2.select(exp(log("a")));
df2.select(pow("a", "a"), pow("b", 2.0));
df2.select(pow(col("a"), col("b")), exp("b"));
df2.select(sin("a"), acos("b"));
df2.select(col("*"), randn(5L));
代码示例来源:origin: org.apache.spark/spark-sql_2.11
@Test
public void testSampleBy() {
  // key = id mod 3 over [0, 100), materialized in 2 partitions.
  Dataset<Row> keyed = spark.range(0, 100, 1, 2).select(col("id").mod(3).as("key"));
  // Stratified sample with seed 0: keep ~10% of key 0 and ~20% of key 1; key 2 is dropped.
  Dataset<Row> sampled = keyed.stat().sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L);
  List<Row> counts = sampled.groupBy("key").count().orderBy("key").collectAsList();
  Row stratum0 = counts.get(0);
  Row stratum1 = counts.get(1);
  // Counts are random but bounded for this fixed seed.
  Assert.assertEquals(0, stratum0.getLong(0));
  Assert.assertTrue(0 <= stratum0.getLong(1) && stratum0.getLong(1) <= 8);
  Assert.assertEquals(1, stratum1.getLong(0));
  Assert.assertTrue(2 <= stratum1.getLong(1) && stratum1.getLong(1) <= 13);
}
代码示例来源:origin: org.apache.spark/spark-sql_2.10
@Test
public void testSampleBy() {
  // key = id mod 3 over [0, 100), materialized in 2 partitions.
  Dataset<Row> keyed = spark.range(0, 100, 1, 2).select(col("id").mod(3).as("key"));
  // Stratified sample with seed 0: keep ~10% of key 0 and ~20% of key 1; key 2 is dropped.
  Dataset<Row> sampled = keyed.stat().sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L);
  List<Row> counts = sampled.groupBy("key").count().orderBy("key").collectAsList();
  Row stratum0 = counts.get(0);
  Row stratum1 = counts.get(1);
  // Counts are random but bounded for this fixed seed.
  Assert.assertEquals(0, stratum0.getLong(0));
  Assert.assertTrue(0 <= stratum0.getLong(1) && stratum0.getLong(1) <= 8);
  Assert.assertEquals(1, stratum1.getLong(0));
  Assert.assertTrue(2 <= stratum1.getLong(1) && stratum1.getLong(1) <= 13);
}
代码示例来源:origin: org.apache.spark/spark-sql_2.11
// NOTE(review): excerpt from a larger test — `df` and `df2` are defined outside this snippet.
// Projection via Column objects vs. SQL expression strings.
df.select(col("key"), col("value"));
df.selectExpr("key", "value + 1");
// Sorting/ordering via Columns and via plain column names.
df.sort(col("key"), col("value"));
df.orderBy("key", "value");
df.orderBy(col("key"), col("value"));
// Grouping and aggregation, again in both name-based and Column-based forms.
df.groupBy("key", "value").agg(col("key"), col("value"), sum("value"));
df.groupBy(col("key"), col("value")).agg(col("key"), col("value"), sum("value"));
df.agg(first("key"), sum("value"));
df.groupBy().agg(countDistinct(col("key"), col("value")));
df.select(coalesce(col("key")));
// Math functions accept column names, Column objects, and literals interchangeably.
df2.select(exp(log("a")));
df2.select(pow("a", "a"), pow("b", 2.0));
df2.select(pow(col("a"), col("b")), exp("b"));
df2.select(sin("a"), acos("b"));
df2.select(col("*"), randn(5L));
代码示例来源:origin: org.apache.spark/spark-sql
// NOTE(review): excerpt from a larger test — `df` and `df2` are defined outside this snippet.
// Projection via Column objects vs. SQL expression strings.
df.select(col("key"), col("value"));
df.selectExpr("key", "value + 1");
// Sorting/ordering via Columns and via plain column names.
df.sort(col("key"), col("value"));
df.orderBy("key", "value");
df.orderBy(col("key"), col("value"));
// Grouping and aggregation, again in both name-based and Column-based forms.
df.groupBy("key", "value").agg(col("key"), col("value"), sum("value"));
df.groupBy(col("key"), col("value")).agg(col("key"), col("value"), sum("value"));
df.agg(first("key"), sum("value"));
df.groupBy().agg(countDistinct(col("key"), col("value")));
df.select(coalesce(col("key")));
// Math functions accept column names, Column objects, and literals interchangeably.
df2.select(exp(log("a")));
df2.select(pow("a", "a"), pow("b", 2.0));
df2.select(pow(col("a"), col("b")), exp("b"));
df2.select(sin("a"), acos("b"));
df2.select(col("*"), randn(5L));
代码示例来源:origin: org.apache.spark/spark-sql
@Test
public void testSelect() {
  Dataset<Integer> ds = spark.createDataset(Arrays.asList(2, 6), Encoders.INT());
  // Project two columns and re-encode the result as a typed (value + 1, value-as-string) tuple.
  Dataset<Tuple2<Integer, String>> projected =
      ds.select(expr("value + 1"), col("value").cast("string"))
          .as(Encoders.tuple(Encoders.INT(), Encoders.STRING()));
  Assert.assertEquals(
      Arrays.asList(tuple2(3, "2"), tuple2(7, "6")),
      projected.collectAsList());
}
代码示例来源:origin: org.apache.spark/spark-sql_2.11
@Test
public void testSelect() {
  Dataset<Integer> ds = spark.createDataset(Arrays.asList(2, 6), Encoders.INT());
  // Project two columns and re-encode the result as a typed (value + 1, value-as-string) tuple.
  Dataset<Tuple2<Integer, String>> projected =
      ds.select(expr("value + 1"), col("value").cast("string"))
          .as(Encoders.tuple(Encoders.INT(), Encoders.STRING()));
  Assert.assertEquals(
      Arrays.asList(tuple2(3, "2"), tuple2(7, "6")),
      projected.collectAsList());
}
代码示例来源:origin: org.apache.spark/spark-sql_2.10
@Test
public void testSelect() {
  Dataset<Integer> ds = spark.createDataset(Arrays.asList(2, 6), Encoders.INT());
  // Project two columns and re-encode the result as a typed (value + 1, value-as-string) tuple.
  Dataset<Tuple2<Integer, String>> projected =
      ds.select(expr("value + 1"), col("value").cast("string"))
          .as(Encoders.tuple(Encoders.INT(), Encoders.STRING()));
  Assert.assertEquals(
      Arrays.asList(tuple2(3, "2"), tuple2(7, "6")),
      projected.collectAsList());
}
代码示例来源:origin: org.apache.spark/spark-sql_2.10
@Test
public void testBloomFilter() {
  Dataset<Long> ids = spark.range(1000);

  // Sized by expected item count and target false-positive probability, by column name.
  BloomFilter byFpp = ids.stat().bloomFilter("id", 1000, 0.03);
  Assert.assertTrue(byFpp.expectedFpp() - 0.03 < 1e-3);
  for (int v = 0; v < 1000; v++) {
    Assert.assertTrue(byFpp.mightContain(v));
  }

  // Same sizing, built from a column expression (id * 3).
  BloomFilter byFppExpr = ids.stat().bloomFilter(col("id").multiply(3), 1000, 0.03);
  Assert.assertTrue(byFppExpr.expectedFpp() - 0.03 < 1e-3);
  for (int v = 0; v < 1000; v++) {
    Assert.assertTrue(byFppExpr.mightContain(v * 3));
  }

  // Sized by an explicit number of bits, by column name.
  BloomFilter byBits = ids.stat().bloomFilter("id", 1000, 64 * 5);
  Assert.assertEquals(64 * 5, byBits.bitSize());
  for (int v = 0; v < 1000; v++) {
    Assert.assertTrue(byBits.mightContain(v));
  }

  // Explicit bit size with a column expression.
  BloomFilter byBitsExpr = ids.stat().bloomFilter(col("id").multiply(3), 1000, 64 * 5);
  Assert.assertEquals(64 * 5, byBitsExpr.bitSize());
  for (int v = 0; v < 1000; v++) {
    Assert.assertTrue(byBitsExpr.mightContain(v * 3));
  }
}
代码示例来源:origin: org.apache.spark/spark-sql_2.11
@Test
public void testBloomFilter() {
  Dataset<Long> ids = spark.range(1000);

  // Sized by expected item count and target false-positive probability, by column name.
  BloomFilter byFpp = ids.stat().bloomFilter("id", 1000, 0.03);
  Assert.assertTrue(byFpp.expectedFpp() - 0.03 < 1e-3);
  for (int v = 0; v < 1000; v++) {
    Assert.assertTrue(byFpp.mightContain(v));
  }

  // Same sizing, built from a column expression (id * 3).
  BloomFilter byFppExpr = ids.stat().bloomFilter(col("id").multiply(3), 1000, 0.03);
  Assert.assertTrue(byFppExpr.expectedFpp() - 0.03 < 1e-3);
  for (int v = 0; v < 1000; v++) {
    Assert.assertTrue(byFppExpr.mightContain(v * 3));
  }

  // Sized by an explicit number of bits, by column name.
  BloomFilter byBits = ids.stat().bloomFilter("id", 1000, 64 * 5);
  Assert.assertEquals(64 * 5, byBits.bitSize());
  for (int v = 0; v < 1000; v++) {
    Assert.assertTrue(byBits.mightContain(v));
  }

  // Explicit bit size with a column expression.
  BloomFilter byBitsExpr = ids.stat().bloomFilter(col("id").multiply(3), 1000, 64 * 5);
  Assert.assertEquals(64 * 5, byBitsExpr.bitSize());
  for (int v = 0; v < 1000; v++) {
    Assert.assertTrue(byBitsExpr.mightContain(v * 3));
  }
}
代码示例来源:origin: org.apache.spark/spark-sql
@Test
public void testBloomFilter() {
  Dataset<Long> ids = spark.range(1000);

  // Sized by expected item count and target false-positive probability, by column name.
  BloomFilter byFpp = ids.stat().bloomFilter("id", 1000, 0.03);
  Assert.assertTrue(byFpp.expectedFpp() - 0.03 < 1e-3);
  for (int v = 0; v < 1000; v++) {
    Assert.assertTrue(byFpp.mightContain(v));
  }

  // Same sizing, built from a column expression (id * 3).
  BloomFilter byFppExpr = ids.stat().bloomFilter(col("id").multiply(3), 1000, 0.03);
  Assert.assertTrue(byFppExpr.expectedFpp() - 0.03 < 1e-3);
  for (int v = 0; v < 1000; v++) {
    Assert.assertTrue(byFppExpr.mightContain(v * 3));
  }

  // Sized by an explicit number of bits, by column name.
  BloomFilter byBits = ids.stat().bloomFilter("id", 1000, 64 * 5);
  Assert.assertEquals(64 * 5, byBits.bitSize());
  for (int v = 0; v < 1000; v++) {
    Assert.assertTrue(byBits.mightContain(v));
  }

  // Explicit bit size with a column expression.
  BloomFilter byBitsExpr = ids.stat().bloomFilter(col("id").multiply(3), 1000, 64 * 5);
  Assert.assertEquals(64 * 5, byBitsExpr.bitSize());
  for (int v = 0; v < 1000; v++) {
    Assert.assertTrue(byBitsExpr.mightContain(v * 3));
  }
}
代码示例来源:origin: org.apache.spark/spark-sql
@Test
public void testCountMinSketch() {
  Dataset<Long> ids = spark.range(1000);

  // Constructed with explicit depth/width, addressed by column name.
  CountMinSketch byDims = ids.stat().countMinSketch("id", 10, 20, 42);
  Assert.assertEquals(1000, byDims.totalCount());
  Assert.assertEquals(10, byDims.depth());
  Assert.assertEquals(20, byDims.width());

  // Same dimensions, addressed by Column.
  CountMinSketch byDimsCol = ids.stat().countMinSketch(col("id"), 10, 20, 42);
  Assert.assertEquals(1000, byDimsCol.totalCount());
  Assert.assertEquals(10, byDimsCol.depth());
  Assert.assertEquals(20, byDimsCol.width());

  // Constructed from target relative error / confidence, by column name.
  CountMinSketch byEps = ids.stat().countMinSketch("id", 0.001, 0.99, 42);
  Assert.assertEquals(1000, byEps.totalCount());
  Assert.assertEquals(0.001, byEps.relativeError(), 1.0e-4);
  Assert.assertEquals(0.99, byEps.confidence(), 5.0e-3);

  // Error/confidence variant addressed by Column.
  CountMinSketch byEpsCol = ids.stat().countMinSketch(col("id"), 0.001, 0.99, 42);
  Assert.assertEquals(1000, byEpsCol.totalCount());
  Assert.assertEquals(0.001, byEpsCol.relativeError(), 1.0e-4);
  Assert.assertEquals(0.99, byEpsCol.confidence(), 5.0e-3);
}
代码示例来源:origin: org.apache.spark/spark-sql_2.11
@Test
public void testCountMinSketch() {
  Dataset<Long> ids = spark.range(1000);

  // Constructed with explicit depth/width, addressed by column name.
  CountMinSketch byDims = ids.stat().countMinSketch("id", 10, 20, 42);
  Assert.assertEquals(1000, byDims.totalCount());
  Assert.assertEquals(10, byDims.depth());
  Assert.assertEquals(20, byDims.width());

  // Same dimensions, addressed by Column.
  CountMinSketch byDimsCol = ids.stat().countMinSketch(col("id"), 10, 20, 42);
  Assert.assertEquals(1000, byDimsCol.totalCount());
  Assert.assertEquals(10, byDimsCol.depth());
  Assert.assertEquals(20, byDimsCol.width());

  // Constructed from target relative error / confidence, by column name.
  CountMinSketch byEps = ids.stat().countMinSketch("id", 0.001, 0.99, 42);
  Assert.assertEquals(1000, byEps.totalCount());
  Assert.assertEquals(0.001, byEps.relativeError(), 1.0e-4);
  Assert.assertEquals(0.99, byEps.confidence(), 5.0e-3);

  // Error/confidence variant addressed by Column.
  CountMinSketch byEpsCol = ids.stat().countMinSketch(col("id"), 0.001, 0.99, 42);
  Assert.assertEquals(1000, byEpsCol.totalCount());
  Assert.assertEquals(0.001, byEpsCol.relativeError(), 1.0e-4);
  Assert.assertEquals(0.99, byEpsCol.confidence(), 5.0e-3);
}
代码示例来源:origin: org.apache.spark/spark-sql_2.10
@Test
public void testCountMinSketch() {
  Dataset<Long> ids = spark.range(1000);

  // Constructed with explicit depth/width, addressed by column name.
  CountMinSketch byDims = ids.stat().countMinSketch("id", 10, 20, 42);
  Assert.assertEquals(1000, byDims.totalCount());
  Assert.assertEquals(10, byDims.depth());
  Assert.assertEquals(20, byDims.width());

  // Same dimensions, addressed by Column.
  CountMinSketch byDimsCol = ids.stat().countMinSketch(col("id"), 10, 20, 42);
  Assert.assertEquals(1000, byDimsCol.totalCount());
  Assert.assertEquals(10, byDimsCol.depth());
  Assert.assertEquals(20, byDimsCol.width());

  // Constructed from target relative error / confidence, by column name.
  CountMinSketch byEps = ids.stat().countMinSketch("id", 0.001, 0.99, 42);
  Assert.assertEquals(1000, byEps.totalCount());
  Assert.assertEquals(0.001, byEps.relativeError(), 1.0e-4);
  Assert.assertEquals(0.99, byEps.confidence(), 5.0e-3);

  // Error/confidence variant addressed by Column.
  CountMinSketch byEpsCol = ids.stat().countMinSketch(col("id"), 0.001, 0.99, 42);
  Assert.assertEquals(1000, byEpsCol.totalCount());
  Assert.assertEquals(0.001, byEpsCol.relativeError(), 1.0e-4);
  Assert.assertEquals(0.99, byEpsCol.confidence(), 5.0e-3);
}
内容来源于网络,如有侵权,请联系作者删除!