Usage of the org.apache.spark.sql.functions.col() method, with code examples


This article collects Java code examples of the org.apache.spark.sql.functions.col() method, showing how functions.col() is used in practice. The examples were extracted from selected projects hosted on platforms such as GitHub, Stack Overflow, and Maven, and should make useful references. Details of the method:
Package path: org.apache.spark.sql.functions
Class name: functions
Method name: col

About functions.col

col(String colName) returns a Column for the given column name. The returned Column can then be used to build expressions passed to Dataset operations such as select, filter, orderBy, groupBy, and join.
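Before turning to the extracted examples, here is a minimal, self-contained sketch of col() used in a filter and a select. The SparkSession setup, the people.json input file, and the name/age columns are illustrative assumptions and are not taken from the examples below.

import static org.apache.spark.sql.functions.col;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ColExample {
 public static void main(String[] args) {
  // Local session, for demonstration only.
  SparkSession spark = SparkSession.builder()
   .appName("col-example")
   .master("local[*]")
   .getOrCreate();

  // Hypothetical input: a JSON file with "name" and "age" fields.
  Dataset<Row> people = spark.read().json("people.json");

  // col("age") and col("name") return Column objects used to build expressions.
  Dataset<Row> adults = people
   .filter(col("age").geq(18))
   .select(col("name"), col("age").plus(1).as("age_next_year"));

  adults.show();
  spark.stop();
 }
}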

Code examples

Code example source: org.apache.spark/spark-sql_2.11 (the same example also appears in org.apache.spark/spark-sql)

@Test
public void testUDF() {
 // Build a two-argument UDF and apply it to the key/value columns selected with col().
 UserDefinedFunction foo = udf((Integer i, String s) -> i.toString() + s, DataTypes.StringType);
 Dataset<Row> df = spark.table("testData").select(foo.apply(col("key"), col("value")));
 String[] result = df.collectAsList().stream().map(row -> row.getString(0))
  .toArray(String[]::new);
 String[] expected = spark.table("testData").collectAsList().stream()
  .map(row -> row.get(0).toString() + row.getString(1)).toArray(String[]::new);
 Assert.assertArrayEquals(expected, result);
}

Code example source: org.apache.spark/spark-sql (the same example also appears in org.apache.spark/spark-sql_2.10 and org.apache.spark/spark-sql_2.11)

@Test
public void testJoin() {
 List<Integer> data = Arrays.asList(1, 2, 3);
 Dataset<Integer> ds = spark.createDataset(data, Encoders.INT()).as("a");
 List<Integer> data2 = Arrays.asList(2, 3, 4);
 Dataset<Integer> ds2 = spark.createDataset(data2, Encoders.INT()).as("b");
 Dataset<Tuple2<Integer, Integer>> joined =
  ds.joinWith(ds2, col("a.value").equalTo(col("b.value")));
 Assert.assertEquals(
  Arrays.asList(tuple2(2, 2), tuple2(3, 3)),
  joined.collectAsList());
}

Code example source: org.apache.spark/spark-sql (the same example also appears in org.apache.spark/spark-sql_2.10 and org.apache.spark/spark-sql_2.11)

@Test
public void testSampleBy() {
 Dataset<Row> df = spark.range(0, 100, 1, 2).select(col("id").mod(3).as("key"));
 Dataset<Row> sampled = df.stat().sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L);
 List<Row> actual = sampled.groupBy("key").count().orderBy("key").collectAsList();
 Assert.assertEquals(0, actual.get(0).getLong(0));
 Assert.assertTrue(0 <= actual.get(0).getLong(1) && actual.get(0).getLong(1) <= 8);
 Assert.assertEquals(1, actual.get(1).getLong(0));
 Assert.assertTrue(2 <= actual.get(1).getLong(1) && actual.get(1).getLong(1) <= 13);
}

Code example source: org.apache.spark/spark-sql_2.10 (the same example also appears in org.apache.spark/spark-sql_2.11 and org.apache.spark/spark-sql)

// df and df2 are DataFrames created earlier in the test suite.
df.select(col("key"), col("value"));
df.selectExpr("key", "value + 1");
df.sort(col("key"), col("value"));
df.orderBy("key", "value");
df.orderBy(col("key"), col("value"));
df.groupBy("key", "value").agg(col("key"), col("value"), sum("value"));
df.groupBy(col("key"), col("value")).agg(col("key"), col("value"), sum("value"));
df.agg(first("key"), sum("value"));
df.groupBy().agg(countDistinct(col("key"), col("value")));
df.select(coalesce(col("key")));
df2.select(exp(log("a")));
df2.select(pow("a", "a"), pow("b", 2.0));
df2.select(pow(col("a"), col("b")), exp("b"));
df2.select(sin("a"), acos("b"));
df2.select(col("*"), randn(5L));

Code example source: org.apache.spark/spark-sql (the same example also appears in org.apache.spark/spark-sql_2.11 and org.apache.spark/spark-sql_2.10)

@Test
public void testSelect() {
 List<Integer> data = Arrays.asList(2, 6);
 Dataset<Integer> ds = spark.createDataset(data, Encoders.INT());
 Dataset<Tuple2<Integer, String>> selected = ds.select(
  expr("value + 1"),
  col("value").cast("string")).as(Encoders.tuple(Encoders.INT(), Encoders.STRING()));
 Assert.assertEquals(
  Arrays.asList(tuple2(3, "2"), tuple2(7, "6")),
  selected.collectAsList());
}

Code example source: org.apache.spark/spark-sql_2.10 (the same example also appears in org.apache.spark/spark-sql_2.11 and org.apache.spark/spark-sql)

@Test
public void testBloomFilter() {
 Dataset<Long> df = spark.range(1000);
 BloomFilter filter1 = df.stat().bloomFilter("id", 1000, 0.03);
 Assert.assertTrue(filter1.expectedFpp() - 0.03 < 1e-3);
 for (int i = 0; i < 1000; i++) {
  Assert.assertTrue(filter1.mightContain(i));
 }
 BloomFilter filter2 = df.stat().bloomFilter(col("id").multiply(3), 1000, 0.03);
 Assert.assertTrue(filter2.expectedFpp() - 0.03 < 1e-3);
 for (int i = 0; i < 1000; i++) {
  Assert.assertTrue(filter2.mightContain(i * 3));
 }
 BloomFilter filter3 = df.stat().bloomFilter("id", 1000, 64 * 5);
 Assert.assertEquals(64 * 5, filter3.bitSize());
 for (int i = 0; i < 1000; i++) {
  Assert.assertTrue(filter3.mightContain(i));
 }
 BloomFilter filter4 = df.stat().bloomFilter(col("id").multiply(3), 1000, 64 * 5);
 Assert.assertEquals(64 * 5, filter4.bitSize());
 for (int i = 0; i < 1000; i++) {
  Assert.assertTrue(filter4.mightContain(i * 3));
 }
}

Code example source: org.apache.spark/spark-sql (the same example also appears in org.apache.spark/spark-sql_2.11 and org.apache.spark/spark-sql_2.10)

@Test
public void testCountMinSketch() {
 Dataset<Long> df = spark.range(1000);
 CountMinSketch sketch1 = df.stat().countMinSketch("id", 10, 20, 42);
 Assert.assertEquals(1000, sketch1.totalCount());
 Assert.assertEquals(10, sketch1.depth());
 Assert.assertEquals(20, sketch1.width());
 CountMinSketch sketch2 = df.stat().countMinSketch(col("id"), 10, 20, 42);
 Assert.assertEquals(1000, sketch2.totalCount());
 Assert.assertEquals(10, sketch2.depth());
 Assert.assertEquals(20, sketch2.width());
 CountMinSketch sketch3 = df.stat().countMinSketch("id", 0.001, 0.99, 42);
 Assert.assertEquals(1000, sketch3.totalCount());
 Assert.assertEquals(0.001, sketch3.relativeError(), 1.0e-4);
 Assert.assertEquals(0.99, sketch3.confidence(), 5.0e-3);
 CountMinSketch sketch4 = df.stat().countMinSketch(col("id"), 0.001, 0.99, 42);
 Assert.assertEquals(1000, sketch4.totalCount());
 Assert.assertEquals(0.001, sketch4.relativeError(), 1.0e-4);
 Assert.assertEquals(0.99, sketch4.confidence(), 5.0e-3);
}
