us.codecraft.webmagic.Page.putField()方法的使用及代码示例

x33g5p2x  于2022-01-26 转载在 其他  
字(10.6k)|赞(0)|评价(0)|浏览(266)

本文整理了Java中us.codecraft.webmagic.Page.putField()方法的一些代码示例,展示了Page.putField()的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。Page.putField()方法的具体详情如下:
包路径:us.codecraft.webmagic.Page
类名称:Page
方法名:putField

Page.putField介绍

[英]store extract results
[中]存储提取结果

代码示例

代码示例来源:origin: code4craft/webmagic

/**
 * Stores the page's raw text under the "html" key, but only when raw text is present.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
  final String rawText = page.getRawText();
  if (rawText == null) {
    // Nothing to store for this page.
    return;
  }
  page.putField("html", rawText);
}

代码示例来源:origin: code4craft/webmagic

/**
 * Stores two extraction results: "name" (text selected by CSS, as a string) and
 * "description" (the XPath selection of the lemma summary).
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
  final String name = page.getHtml()
      .css("dl.lemmaWgt-lemmaTitle h1", "text")
      .toString();
  page.putField("name", name);

  page.putField(
      "description",
      page.getHtml().xpath("//div[@class='lemma-summary']/allText()"));
}

代码示例来源:origin: code4craft/webmagic

/**
 * Stores the text selected from the repository title element under "reponame".
 *
 * @param page the page being processed
 * @return always {@code RequestMatcher.MatchOther.YES}
 */
@Override
public RequestMatcher.MatchOther processPage(Page page) {
  final String repoName = page.getHtml()
      .xpath("//h1[@class='entry-title public']/strong/a/text()")
      .toString();
  page.putField("reponame", repoName);
  return RequestMatcher.MatchOther.YES;
}

代码示例来源:origin: code4craft/webmagic

/**
 * Queues every link matching {@code urlPattern} and stores the "title", "html" and
 * "content" extraction results for the page.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
  // URLs to fetch next.
  page.addTargetRequests(page.getHtml().links().regex(urlPattern).all());

  // Title is selected by XPath.
  page.putField("title", page.getHtml().xpath("//title"));

  final String html = page.getHtml().toString();
  page.putField("html", html);

  // Main content is extracted by Readability.
  page.putField("content", page.getHtml().smartContent());
}

代码示例来源:origin: code4craft/webmagic

/**
 * Queues the captured href values of matching anchors and stores the "title" and
 * "content" extraction results.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
  page.addTargetRequests(
      page.getHtml()
          .regex("<a[^<>]*href=(bbstcon\\?board=Pictures&file=[^>]*)")
          .all());

  page.putField(
      "title",
      page.getHtml().xpath("//div[@id='content']//h2/a"));
  page.putField(
      "content",
      page.getHtml().smartContent());
}

代码示例来源:origin: code4craft/webmagic

/**
 * Queues the post links captured from anchor href attributes and stores the "title"
 * and "content" extraction results.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
  //http://progressdaily.diandian.com/post/2013-01-24/40046867275
  //http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106
  // &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone
  page.addTargetRequests(
      page.getHtml()
          .regex("<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}")
          .all());

  page.putField(
      "title",
      page.getHtml().xpath("//div[@id='content']//h2/a"));
  page.putField(
      "content",
      page.getHtml().smartContent());
}

代码示例来源:origin: code4craft/webmagic

/**
 * Queues the ".shtml" post links captured from anchor href attributes and stores the
 * "title" and "body" extraction results.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
  page.addTargetRequests(
      page.getHtml()
          .regex("<a[^<>]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}")
          .all());

  page.putField(
      "title",
      page.getHtml().xpath("//div[@id='post_head']//span[@class='s_title']//b"));
  page.putField(
      "body",
      page.getHtml().smartContent());
}

代码示例来源:origin: code4craft/webmagic

/**
 * Queues every link containing "/yewu/" and stores the "title" (regex capture of the
 * title tag) and "body" (XPath selection of {@code dd} elements) results.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
  page.addTargetRequests(
      page.getHtml().links().regex(".*/yewu/.*").all());

  page.putField(
      "title",
      page.getHtml().regex("<title>(.*)</title>"));
  page.putField(
      "body",
      page.getHtml().xpath("//dd"));
}

代码示例来源:origin: code4craft/webmagic

/**
 * Queues at most the first two area links plus every restaurant link, then stores the
 * dish names ("items") and dish prices ("prices") selected from the menu list.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
  //http://progressdaily.diandian.com/post/2013-01-24/40046867275
  final List<String> areaLinks =
      page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").all();
  // Only the first two area links are followed.
  final List<String> limitedAreaLinks =
      areaLinks.size() > 2 ? areaLinks.subList(0, 2) : areaLinks;
  page.addTargetRequests(limitedAreaLinks);

  final List<String> restaurantLinks =
      page.getHtml().links().regex("(.*/restaurant/[^#]+)").all();
  page.addTargetRequests(restaurantLinks);

  page.putField(
      "items",
      page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]/text()"));
  page.putField(
      "prices",
      page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]/text()"));
}

代码示例来源:origin: code4craft/webmagic

/**
 * Queues the next shop page (current shop id + 1) and stores the "title" and "items"
 * extraction results.
 *
 * <p>The shop id is read from the page URL with the pattern {@code shop/(\d+)}. When the
 * URL carries no such segment, no follow-up request is queued; the fields are still
 * extracted. Previously this case failed with a {@link NumberFormatException} because a
 * null id was handed to the integer parser.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
  //http://progressdaily.diandian.com/post/2013-01-24/40046867275
  final String shopId = page.getUrl().regex("shop/(\\d+)").toString();
  if (shopId != null) {
    // parseInt yields a primitive directly, avoiding the Integer boxing of valueOf.
    final int nextShopId = Integer.parseInt(shopId) + 1;
    page.addTargetRequest("http://kaichiba.com/shop/" + nextShopId);
  }

  page.putField("title", page.getHtml().xpath("//Title"));
  // Strip leading/trailing whitespace and any <span>...</span> fragments from each item.
  page.putField(
      "items",
      page.getHtml()
          .xpath("//li[@class=\"foodTitle\"]")
          .replace("^\\s+", "")
          .replace("\\s+$", "")
          .replace("<span>.*?</span>", ""));
}

代码示例来源:origin: code4craft/webmagic

/**
 * Queues every link containing "article" and stores the "title" and "content"
 * extraction results.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
  page.addTargetRequests(
      page.getHtml().links().regex(".*article.*").all());

  page.putField(
      "title",
      page.getHtml().xpath("//div[@class='clearfix neirong']//h1/text()"));
  page.putField(
      "content",
      page.getHtml().xpath("//div[@id='neirong_box']/tidyText()"));
}

代码示例来源:origin: code4craft/webmagic

/**
 * Queues the listing-page links and the thread links found on the page. When the
 * current URL contains "thread", also stores the "title", "content", "date" and "id"
 * extraction results.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
  // Listing pages first, then individual threads.
  page.addTargetRequests(
      page.getHtml().links()
          .regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)")
          .all());
  page.addTargetRequests(
      page.getHtml().links()
          .regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)")
          .all());

  final boolean isThreadUrl = page.getUrl().toString().contains("thread");
  if (!isThreadUrl) {
    return;
  }

  page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
  page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()"));
  page.putField("date", page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));

  // The stored id is the thread number from the URL prefixed with "1000".
  final String threadNumber = page.getUrl()
      .regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html")
      .toString();
  page.putField("id", new PlainText("1000" + threadNumber));
}

代码示例来源:origin: code4craft/webmagic

/**
 * Queues every blog-post link and stores the "title" and "content" extraction results
 * as strings.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
  page.addTargetRequests(
      page.getHtml().links().regex(".*yanghaoli\\.iteye\\.com/blog/\\d+").all());

  final String title = page.getHtml().xpath("//title").toString();
  page.putField("title", title);

  final String content = page.getHtml().smartContent().toString();
  page.putField("content", content);
}

代码示例来源:origin: code4craft/webmagic

/**
 * Queues every answer link and stores the "title", "question" and "answer" extraction
 * results as strings. The page is marked as skipped when no title was stored.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
  page.addTargetRequests(
      page.getHtml().links()
          .regex("https://www\\.zhihu\\.com/question/\\d+/answer/\\d+.*")
          .all());

  final String title = page.getHtml()
      .xpath("//h1[@class='QuestionHeader-title']/text()")
      .toString();
  page.putField("title", title);

  final String question = page.getHtml()
      .xpath("//div[@class='QuestionRichText']//tidyText()")
      .toString();
  page.putField("question", question);

  final String answer = page.getHtml()
      .xpath("//div[@class='QuestionAnswer-content']/tidyText()")
      .toString();
  page.putField("answer", answer);

  final boolean titleMissing = page.getResultItems().get("title") == null;
  if (titleMissing) {
    // skip this page
    page.setSkip(true);
  }
}

代码示例来源:origin: code4craft/webmagic

/**
 * Handles two kinds of responses. For a URL matching {@code LIST_URL}, every
 * {@code _id} selected from the raw JSON text is queued as an article request.
 * For any other URL, the "title" and "content" values selected from the raw JSON text
 * are stored.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
  final boolean isListPage = page.getUrl().regex(LIST_URL).match();
  if (!isListPage) {
    page.putField(
        "title",
        new JsonPathSelector("$.data.title").select(page.getRawText()));
    page.putField(
        "content",
        new JsonPathSelector("$.data.content").select(page.getRawText()));
    return;
  }

  final List<String> articleIds =
      new JsonPathSelector("$.data[*]._id").selectList(page.getRawText());
  if (!CollectionUtils.isNotEmpty(articleIds)) {
    return;
  }
  for (String articleId : articleIds) {
    page.addTargetRequest("http://angularjs.cn/api/article/" + articleId);
  }
}

代码示例来源:origin: code4craft/webmagic

/**
 * Stores a tab-separated "result" line made of the province and district carried on the
 * request plus the zip code captured from the page, then queues each matching link with
 * priority 2 and the same province/district extras.
 *
 * @param page the page being processed
 */
private void processDistrict(Page page) {
  final String province = page.getRequest().getExtra("province").toString();
  final String district = page.getRequest().getExtra("district").toString();
  final String zipCode = page.getHtml().regex("<h2>邮编:(\\d+)</h2>").toString();

  final String[] columns = new String[]{province, district, zipCode};
  page.putField("result", StringUtils.join(columns, "\t"));

  final List<String> followUps =
      page.getHtml().links().regex("http://www\\.ip138\\.com/\\d{6}[/]?$").all();
  for (String url : followUps) {
    page.addTargetRequest(
        new Request(url)
            .setPriority(2)
            .putExtra("province", province)
            .putExtra("district", district));
  }
}

代码示例来源:origin: code4craft/webmagic

/**
 * Queues repository links and owner links found on the page, then stores the "author",
 * "name" and "readme" extraction results. The page is marked as skipped when no
 * repository name was stored.
 *
 * <p>The owner-link pattern uses {@code [\w\-]+}; without the {@code +} quantifier the
 * capture group kept only a single character after {@code github.com/}. The author
 * pattern uses the same character class so that owner names containing a hyphen are
 * captured as well.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
  page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
  page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+)").all());
  page.putField("author", page.getUrl().regex("https://github\\.com/([\\w\\-]+)/.*").toString());
  page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
  if (page.getResultItems().get("name") == null) {
    //skip this page
    page.setSkip(true);
  }
  page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
}

代码示例来源:origin: code4craft/webmagic

/**
 * Queues every minibook link. If the page has links ending in ".pdf", they are stored
 * under "pdf"; otherwise the page's result items are marked as skipped.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
  page.addTargetRequests(
      page.getHtml().links().regex("http://www\\.infoq\\.com/cn/minibooks/.*").all());

  final List<String> pdfLinks = page.getHtml().links().regex(".*\\.pdf").all();
  if (!CollectionUtils.isNotEmpty(pdfLinks)) {
    page.getResultItems().setSkip(true);
    return;
  }
  page.putField("pdf", pdfLinks);
}

代码示例来源:origin: code4craft/webmagic

/**
 * Logs the page URL, queues repository links and owner links, and stores the text of the
 * full-name element under "username".
 *
 * @param page the page being processed
 * @return always {@code RequestMatcher.MatchOther.YES}
 */
@Override
public RequestMatcher.MatchOther processPage(Page page) {
  log.info("Extracting from " + page.getUrl());

  page.addTargetRequests(
      page.getHtml().links().regex("https://github\\.com/[\\w\\-]+/[\\w\\-]+").all());
  page.addTargetRequests(
      page.getHtml().links().regex("https://github\\.com/[\\w\\-]+").all());

  final String username = page.getHtml()
      .xpath("//span[@class='vcard-fullname']/text()")
      .toString();
  page.putField("username", username);

  return RequestMatcher.MatchOther.YES;
}

代码示例来源:origin: code4craft/webmagic

/**
 * Queues repository links and owner links, then maps the page through
 * {@code githubRepoPageMapper}. A non-null mapping result is stored under "repo";
 * a null result marks the page as skipped.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
  page.addTargetRequests(
      page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
  page.addTargetRequests(
      page.getHtml().links().regex("(https://github\\.com/\\w+)").all());

  final GithubRepo mappedRepo = githubRepoPageMapper.get(page);
  if (mappedRepo != null) {
    page.putField("repo", mappedRepo);
    return;
  }
  page.setSkip(true);
}

相关文章