org.apache.spark.sql.DataFrame.javaRDD()方法的使用及代码示例

x33g5p2x  于2022-01-18 转载在 其他  
字(6.4k)|赞(0)|评价(0)|浏览(287)

本文整理了Java中org.apache.spark.sql.DataFrame.javaRDD()方法的一些代码示例,展示了DataFrame.javaRDD()的具体用法。这些代码示例主要来源于GitHub/Stack Overflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。DataFrame.javaRDD()方法的具体详情如下:
包路径:org.apache.spark.sql.DataFrame
类名称:DataFrame
方法名:javaRDD

DataFrame.javaRDD介绍

暂无

代码示例

代码示例来源:origin: phuonglh/vn.vitk

/**
 * Builds one tagged sentence per row of the given data frame.
 * <p>
 * Column 0 and column 1 of each row are read as strings, trimmed and split on
 * whitespace. When both columns yield the same number of pieces, the pieces are
 * paired positionally as {@code first/second} and the pairs are separated by single
 * spaces. When the counts differ, a message is written to standard error and the
 * row maps to {@code null}.
 *
 * @param output the data frame whose rows are converted
 * @return an RDD holding one string (or {@code null}) per input row
 */
private JavaRDD<String> toTaggedSentence(DataFrame output) {
  Function<Row, String> pairUp = new Function<Row, String>() {
    private static final long serialVersionUID = 4208643510231783579L;

    @Override
    public String call(Row row) throws Exception {
      String[] words = row.getString(0).trim().split("\\s+");
      String[] labels = row.getString(1).trim().split("\\s+");
      if (words.length == labels.length) {
        StringBuilder buffer = new StringBuilder(64);
        int index = 0;
        while (index < words.length) {
          buffer.append(words[index]).append('/').append(labels[index]).append(' ');
          index++;
        }
        // The trailing separator appended by the last iteration is removed here.
        return buffer.toString().trim();
      }
      System.err.println("Incompatible lengths!");
      return null;
    }
  };
  return output.javaRDD().map(pairUp);
}

代码示例来源:origin: stackoverflow.com

// Load the parquet data and expose it to SQL under the temp table name "pd".
DataFrame parquetData = sqlContext.read().parquet("/Users/leewallen/dev/spark_data/out/ParquetData");
parquetData.registerTempTable("pd");
DataFrame idGroupsDataFrame = sqlContext.sql("select id_groups.ids from pd");

// Each row's first column is read as a list of wrapped string arrays. Every inner array is
// flattened to a comma-separated string, and those strings are joined with '|'.
List<String> idList = idGroupsDataFrame.javaRDD()
                    .map((Function<Row, String>) row -> {
  List<WrappedArray<String>> wrappedArrayList = row.getList(0);
  List<String> ids = new ArrayList<>();
  for (WrappedArray<String> idWrappedArray : wrappedArrayList) {
    List<String> tempIds = new ArrayList<>();
    Iterator<String> stringIter = idWrappedArray.iterator();
    while (stringIter.hasNext()) {
      tempIds.add(stringIter.next());
    }

    // String.join yields "" for an empty array instead of throwing the
    // NoSuchElementException that reduce(...).get() raised, and it avoids
    // re-formatting the growing string once per element.
    ids.add(String.join(",", tempIds));
  }

  return String.join("|", ids);
}).collect();

idList.forEach(System.out::println);

代码示例来源:origin: psal/jstylo

/**
 * Converts a DataFrame to a JavaRDD&lt;LabeledPoint&gt;, the form the original author notes
 * most Spark mllib classes need to perform classification.
 *
 * @param df the data frame whose rows are converted
 * @return the RDD obtained by applying a {@code LabeledFromRow} function to every row of {@code df}
 */
public static JavaRDD<LabeledPoint> DataFrameToLabeledPointRDD(DataFrame df){
  LabeledFromRow rowConverter = new LabeledFromRow();
  return df.javaRDD().map(rowConverter);
}

代码示例来源:origin: phuonglh/vn.vitk

/**
 * Evaluates the accuracy of a CMM model on a data frame of tagged sentences.
 * <p>
 * Column 1 of the input is collected as the reference sequences, the model is applied
 * to the data set, and column 1 of the model output is collected as the automatic
 * sequences. Both lists are handed to {@code Evaluator.evaluate}. When {@code verbose}
 * is set, the sentence count and the wall-clock time spent in the {@code transform}
 * call are printed to standard output.
 *
 * @param dataset the tagged sentences to evaluate on
 * @return evaluation measures.
 */
public float[] evaluate(DataFrame dataset) {
  final List<String> gold = dataset.javaRDD().map(new RowToStringFunction(1)).collect();

  // Only the transform call itself is timed.
  final long startedAt = System.currentTimeMillis();
  final DataFrame tagged = cmmModel.transform(dataset);
  final long elapsed = System.currentTimeMillis() - startedAt;

  if (verbose) {
    final int sentenceCount = gold.size();
    final float averageTime = ((float) elapsed) / sentenceCount;
    System.out.println(" Number of sentences = " + sentenceCount);
    System.out.println("  Total tagging time = " + elapsed + " ms.");
    System.out.println("Average tagging time = " + averageTime + " ms.");
  }

  final List<String> predicted = tagged.javaRDD().map(new RowToStringFunction(1)).collect();
  return Evaluator.evaluate(predicted, gold);
}

代码示例来源:origin: oeljeklaus-you/UserActionAnalyzePlatform

/**
 * Fetches the rows of {@code user_visit_action} whose {@code date} lies within the
 * requested range (both bounds inclusive).
 * <p>
 * The bounds are read from the task parameters and applied through the Column API
 * rather than being concatenated into a SQL string, so a parameter value containing
 * quotes cannot alter the query.
 *
 * @param sc        the SQL context through which {@code user_visit_action} is resolved
 * @param taskParam task parameters holding the start and end time
 * @return the matching rows as an RDD
 */
private static JavaRDD<Row> getActionRDD(SQLContext sc, JSONObject taskParam)
{
  String startTime=ParamUtils.getParam(taskParam,Constants.PARAM_STARTTIME);
  String endTime=ParamUtils.getParam(taskParam,Constants.PARAM_ENDTIME);
  DataFrame actions=sc.table("user_visit_action");
  // Equivalent to: where date >= startTime and date <= endTime, with both values bound as literals.
  DataFrame df=actions.filter(
      actions.col("date").geq(startTime).and(actions.col("date").leq(endTime)));
  return df.javaRDD();
}

代码示例来源:origin: Erik-ly/SprakProject

/**
 * Fetches the user visit action rows whose {@code date} lies within the requested
 * range (both bounds inclusive).
 * <p>
 * The previous implementation concatenated SQL fragments without separating spaces,
 * producing {@code "...from user_visit_actionwhere date>='..'and date<=.."}, which does
 * not express the intended WHERE clause. The filter is now built with the Column API,
 * which also keeps parameter values out of the query text.
 *
 * @param sqlContext SQLContext through which {@code user_visit_action} is resolved
 * @param taskParam task parameters
 * @return RDD of the matching action rows
 */
private static JavaRDD<Row> getActionRDDByDateRange(
    SQLContext sqlContext, JSONObject taskParam) {
  
  // The task-related constants must first be added to Constants.java, e.g.
  //   String PARAM_START_DATE = "startDate";
  //   String PARAM_END_DATE = "endDate";
  String startDate = ParamUtils.getParam(taskParam, Constants.PARAM_START_DATE);
  String endDate = ParamUtils.getParam(taskParam, Constants.PARAM_END_DATE);
  
  DataFrame allActions = sqlContext.table("user_visit_action");
  
  // Equivalent to: where date >= startDate and date <= endDate, with both values bound as literals.
  DataFrame actionDF = allActions.filter(
      allActions.col("date").geq(startDate)
          .and(allActions.col("date").leq(endDate)));
  
  return actionDF.javaRDD();
}

代码示例来源:origin: oeljeklaus-you/UserActionAnalyzePlatform

// Execute the query held in `sql` via sc.sql(...) and expose the resulting DataFrame as a JavaRDD of rows.
JavaRDD<Row> userInfoRDD=sc.sql(sql).javaRDD();

代码示例来源:origin: phuonglh/vn.vitk

/**
 * Tags a list of sentences and returns the corresponding list of tag sequences.
 * <p>
 * The sentences are wrapped into single-column rows (column name {@code "sentence"}),
 * turned into a data frame, passed through the CMM model, and column 1 of the model
 * output is collected. When no model is available a message is written to standard
 * error and {@code null} is returned.
 *
 * @param sentences the sentences to tag
 * @return a list of tagged sequences, or {@code null} when {@code cmmModel} is {@code null}.
 */
public List<String> tag(List<String> sentences) {
  StructField sentenceField =
      new StructField("sentence", DataTypes.StringType, false, Metadata.empty());
  StructType schema = new StructType(new StructField[]{ sentenceField });

  List<Row> rows = new LinkedList<Row>();
  for (String text : sentences) {
    rows.add(RowFactory.create(text));
  }

  SQLContext sqlContext = new SQLContext(jsc);
  DataFrame input = sqlContext.createDataFrame(rows, schema);

  // The model check stays after the input frame is built, as in the original flow.
  if (cmmModel == null) {
    System.err.println("Tagging model is null. You need to create or load a model first.");
    return null;
  }

  DataFrame tagged = cmmModel.transform(input).repartition(1);
  return tagged.javaRDD().map(new RowToStringFunction(1)).collect();
}

代码示例来源:origin: Erik-ly/SprakProject

// Execute the query held in `sql` via sqlContext.sql(...) and expose the resulting DataFrame as a JavaRDD of rows.
JavaRDD<Row> userInfoRDD = sqlContext.sql(sql).javaRDD();

代码示例来源:origin: phuonglh/vn.vitk

/**
 * Applies a {@code DecodeFunction} to every row of the data set and wraps the result
 * in a new data frame with two non-nullable string columns, {@code "sentence"} and
 * {@code "prediction"}.
 *
 * @param dataset the input data frame
 * @return a data frame created from the decoded rows, using the input's SQL context
 */
@Override
public DataFrame transform(DataFrame dataset) {
  StructField[] fields = new StructField[]{
    new StructField("sentence", DataTypes.StringType, false, Metadata.empty()),
    new StructField("prediction", DataTypes.StringType, false, Metadata.empty())
  };
  StructType resultSchema = new StructType(fields);

  JavaRDD<Row> decodedRows = dataset.javaRDD().map(new DecodeFunction());
  return dataset.sqlContext().createDataFrame(decodedRows, resultSchema);
}

代码示例来源:origin: phuonglh/vn.vitk

// Pick the MarkovOrder constant at position (configured Markov order - 1) in MarkovOrder.values().
MarkovOrder order = MarkovOrder.values()[(Integer)params.getOrDefault(params.getMarkovOrder())-1];
// Extract labeled contexts from the data set's rows using the chosen order and the regexp file constant.
ContextExtractor contextExtractor = new ContextExtractor(order, Constants.REGEXP_FILE);
JavaRDD<LabeledContext> contexts = contextExtractor.extract(dataset.javaRDD());
// Build a data frame from the extracted contexts, with LabeledContext as the bean class.
DataFrame dataFrame = dataset.sqlContext().createDataFrame(contexts, LabeledContext.class);
// NOTE(review): `df` is not defined in the visible snippet; presumably it is a frame derived
// from `dataFrame` in lines omitted by the excerpt. Confirm against the original source.
JavaRDD<Row> wt = df.select("word", "label").javaRDD();
JavaPairRDD<String, Set<Integer>> tagDictionary = wt.mapToPair(new PairFunction<Row, String, Set<Integer>>(){
  private static final long serialVersionUID = 5865372074294028547L;

相关文章