Usage of org.apache.spark.sql.DataFrameWriter.parquet() with Code Examples


This article collects Java code examples of the org.apache.spark.sql.DataFrameWriter.parquet() method and shows how it is used in practice. The examples are drawn from selected open-source projects found on GitHub, Stack Overflow, and Maven, and are intended as practical references. Details of DataFrameWriter.parquet() are as follows:
Package: org.apache.spark.sql
Class: DataFrameWriter
Method: parquet

About DataFrameWriter.parquet

DataFrameWriter.parquet(path) saves the content of a DataFrame/Dataset as Parquet files under the given path. The write mode, partitioning, and compression can be configured beforehand via mode(), partitionBy(), and option().
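
Before the collected examples, here is a minimal, self-contained sketch of the typical call pattern. The SparkSession setup, the JSON input file people.json, and the output directory /tmp/people.parquet are illustrative assumptions, not taken from the projects below.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class ParquetWriteSketch {
  public static void main(String[] args) {
    // Local SparkSession for illustration only.
    SparkSession spark = SparkSession.builder()
        .appName("ParquetWriteSketch")
        .master("local[*]")
        .getOrCreate();

    // Any DataFrame works here; reading JSON is just a placeholder source.
    Dataset<Row> df = spark.read().json("people.json");

    // Write the DataFrame as Parquet files under the given directory,
    // overwriting previous output if it exists.
    df.write()
        .mode(SaveMode.Overwrite)
        .parquet("/tmp/people.parquet");

    spark.stop();
  }
}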

Code Examples

Code example source: mahmoudparsian/data-algorithms-book

schemaPeople.write().parquet("people.parquet");

Code example source: mahmoudparsian/data-algorithms-book

sqlResult.write().parquet(output + "/parquetFormat"); // saves output in compressed Parquet format, recommended for large projects.
sqlResult.rdd().saveAsTextFile(output + "/textFormat"); // to see output via cat command

Code example source: uber/uberscriptquery

public static void writeParquet(Dataset<Row> df, String outputPath, SaveMode saveMode, int numPartitions) {
  logger.info(String.format("Saving parquet file %s, saveMode: %s, numPartitions: %s", outputPath, saveMode, numPartitions));
  String hdfsOutputPath = outputPath;
  if (hdfsOutputPath.toLowerCase().startsWith(HDFS_PREFIX_LOWERCASE)) {
    hdfsOutputPath = hdfsOutputPath.substring(HDFS_PREFIX_LOWERCASE.length());
  }
  df.coalesce(numPartitions).write().mode(saveMode).parquet(hdfsOutputPath);
  logger.info(String.format("Saved parquet file %s, saveMode: %s, numPartitions: %s", outputPath, saveMode, numPartitions));
}

Code example source: org.apache.spark/spark-sql (the same test appears in spark-sql_2.10 and spark-sql_2.11)

@Test
public void testParquetAPI() {
 spark.read().schema(schema).parquet();
 spark.read().schema(schema).parquet(input);
 spark.read().schema(schema).parquet(input, input, input);
 spark.read().schema(schema).parquet(new String[] { input, input })
   .write().parquet(output);
}

Code example source: uber/marmaray

public void writeParquet() throws IOException {
  // TODO: Consider having a configuration to limit number records written out
  this.dataset.write().mode(SaveMode.Append).parquet(getDestWritePath().toString());
}

Code example source: uber/uberscriptquery

@Override
  public Object execute(SparkSession sparkSession, ActionStatement actionStatement, CredentialProvider credentialManager) {

    String filePath = actionStatement.getParamValues().get(0).getValue().toString();
    String saveModeStr = actionStatement.getParamValues().get(1).getValue().toString();
    String dfTableName = actionStatement.getParamValues().get(2).getValue().toString();

    SaveMode saveMode = SaveMode.valueOf(saveModeStr);

    String sql = String.format("select * from %s", dfTableName);
    logger.info(String.format("Running sql [%s] to get data and then save it", sql));
    Dataset<Row> df = sparkSession.sql(sql);

    logger.info(String.format("Saving to parquet %s, saveMode: %s", filePath, saveMode));
    df.coalesce(1).write().mode(saveMode).parquet(filePath);
    logger.info(String.format("Saved to parquet %s, saveMode: %s", filePath, saveMode));
    return null;
  }
}

Code example source: uk.gov.gchq.gaffer/parquet-store

.parquet(outputDir);
} else {
  LOGGER.debug("Skipping the sorting and aggregation of group: {}, due to no data existing in the temporary files directory: {}", group, tempFileDir);

Code example source: cloudera-labs/envelope

case PARQUET_FORMAT:
 LOG.debug("Writing Parquet: {}", path);
 writer.parquet(path);
 break;
case CSV_FORMAT:

Code example source: phuonglh/vn.vitk

break;
case PARQUET:
  output.write().parquet(outputFileName);
  break;
case TEXT:

Code example source: phuonglh/vn.vitk

@Override
  public void saveImpl(String path) {
    // save metadata and params
    DefaultParamsWriter.saveMetadata(instance, path, sc(), 
        DefaultParamsWriter.saveMetadata$default$4(),
        DefaultParamsWriter.saveMetadata$default$5());
    // save model data: markovOrder, numLabels, weights
    Data data = new Data();
    data.setMarkovOrder(contextExtractor.getMarkovOrder().ordinal()+1);
    data.setWeights(weights);
    data.setTagDictionary(tagDictionary);
    List<Data> list = new LinkedList<Data>();
    list.add(data);
    String dataPath = new Path(path, "data").toString();
    sqlContext().createDataFrame(list, Data.class).write().parquet(dataPath);
    // save pipeline model
    try {
      String pipelinePath = new Path(path, "pipelineModel").toString(); 
      pipelineModel.write().overwrite().save(pipelinePath);
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}

Code example source: uk.gov.gchq.gaffer/parquet-store

.write()
.option("compression", "gzip")
.parquet(outputDir);

Code example source: sectong/SparkToParquet

df.write().partitionBy("ipAddress", "method", "responseCode").mode(SaveMode.Append).parquet(Flags.getInstance().getParquetFile());

Code example source: KeithSSmith/spark-compaction

public void compact(String inputPath, String outputPath) throws IOException {
  this.setCompressionAndSerializationOptions(inputPath, outputPath);
  this.outputCompressionProperties(this.outputCompression);
  
  // Defining Spark Context with a generic Spark Configuration.
  SparkConf sparkConf = new SparkConf().setAppName("Spark Compaction");
  JavaSparkContext sc = new JavaSparkContext(sparkConf);
  
  if (this.outputSerialization.equals(TEXT)) {
    JavaRDD<String> textFile = sc.textFile(this.concatInputPath(inputPath));
    textFile.coalesce(this.splitSize).saveAsTextFile(outputPath);
  } else if (this.outputSerialization.equals(PARQUET)) {
    SQLContext sqlContext = new SQLContext(sc);
    DataFrame parquetFile = sqlContext.read().parquet(this.concatInputPath(inputPath));
    parquetFile.coalesce(this.splitSize).write().parquet(outputPath);
  } else if (this.outputSerialization.equals(AVRO)) {
    // For this to work the files must end in .avro
    // Another issue is that when using compression the compression codec extension is not being added to the file name.
    SQLContext sqlContext = new SQLContext(sc);
    DataFrame avroFile = sqlContext.read().format("com.databricks.spark.avro").load(this.concatInputPath(inputPath));
    avroFile.coalesce(this.splitSize).write().format("com.databricks.spark.avro").save(outputPath);
  } else {
    System.out.println("Did not match any serialization type: text, parquet, or avro.  Recieved: " +
        this.outputSerialization);
  }
}

Code example source: KeithSSmith/spark-compaction

public void compact(String[] args) throws IOException {
  this.setCompressionAndSerializationOptions(this.parseCli(args));
  this.outputCompressionProperties(this.outputCompression);
  
  // Defining Spark Context with a generic Spark Configuration.
  SparkConf sparkConf = new SparkConf().setAppName("Spark Compaction");
  JavaSparkContext sc = new JavaSparkContext(sparkConf);
      
  if (this.outputSerialization.equals(TEXT)) {
    JavaRDD<String> textFile = sc.textFile(this.concatInputPath(inputPath));
    textFile.coalesce(this.splitSize).saveAsTextFile(outputPath);
  } else if (this.outputSerialization.equals(PARQUET)) {
    SQLContext sqlContext = new SQLContext(sc);
    DataFrame parquetFile = sqlContext.read().parquet(this.concatInputPath(inputPath));
    parquetFile.coalesce(this.splitSize).write().parquet(outputPath);
  } else if (this.outputSerialization.equals(AVRO)) {
    // For this to work the files must end in .avro
    SQLContext sqlContext = new SQLContext(sc);
    DataFrame avroFile = sqlContext.read().format("com.databricks.spark.avro").load(this.concatInputPath(inputPath));
    avroFile.coalesce(this.splitSize).write().format("com.databricks.spark.avro").save(outputPath);
  } else {
    System.out.println("Did not match any serialization type: text, parquet, or avro.  Recieved: " +
        this.outputSerialization);
  }
}
