us.codecraft.webmagic.selector.Html类的使用及代码示例

x33g5p2x  于2022-01-20 转载在 其他  
字(9.9k)|赞(0)|评价(0)|浏览(194)

本文整理了Java中us.codecraft.webmagic.selector.Html类的一些代码示例,展示了Html类的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。Html类的具体详情如下:
包路径:us.codecraft.webmagic.selector.Html
类名称:Html

Html介绍

[英]Selectable html.
[中]可选择的html。

代码示例

代码示例来源:origin: code4craft/webmagic

/**
 * Queues follow-up requests (at most two area links plus every restaurant link)
 * and stores the XPath selections for dish names and prices on the page.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
  // Sample page: http://progressdaily.diandian.com/post/2013-01-24/40046867275
  List<String> areaLinks = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").all();
  // Only the first two area links are followed.
  List<String> limitedAreaLinks = areaLinks.size() > 2 ? areaLinks.subList(0, 2) : areaLinks;
  page.addTargetRequests(limitedAreaLinks);

  List<String> restaurantLinks = page.getHtml().links().regex("(.*/restaurant/[^#]+)").all();
  page.addTargetRequests(restaurantLinks);

  page.putField(
      "items",
      page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]/text()"));
  page.putField(
      "prices",
      page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]/text()"));
}

代码示例来源:origin: code4craft/webmagic

/**
 * Queues the item and related-question links, then publishes the vote count,
 * content and author link of answers whose vote count is at least {@code voteNum}.
 * The page is marked as skipped when no answer qualifies.
 *
 * <p>An answer whose vote text is missing or not a plain integer is ignored
 * instead of aborting the whole page with a {@link NumberFormatException}.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
  List<String> relativeUrl = page.getHtml().xpath("//li[@class='item clearfix']/div/a/@href").all();
  page.addTargetRequests(relativeUrl);
  relativeUrl = page.getHtml().xpath("//div[@id='zh-question-related-questions']//a[@class='question_link']/@href").all();
  page.addTargetRequests(relativeUrl);

  List<String> answers = page.getHtml().xpath("//div[@id='zh-question-answer-wrap']/div").all();
  boolean exist = false;
  for (String answer : answers) {
    // Parse the answer fragment once and reuse it for every selector below.
    Html answerHtml = new Html(answer);
    String vote = answerHtml.xpath("//div[@class='zm-votebar']//span[@class='count']/text()").toString();
    if (vote == null) {
      // No vote counter found in this fragment; nothing to compare.
      continue;
    }
    int votes;
    try {
      votes = Integer.parseInt(vote.trim());
    } catch (NumberFormatException ignored) {
      // Vote text is not a plain integer; treat the answer as non-qualifying.
      continue;
    }
    if (votes >= voteNum) {
      page.putField("vote", vote);
      page.putField("content", answerHtml.xpath("//div[@class='zm-editable-content']"));
      page.putField("userid", answerHtml.xpath("//a[@class='author-link']/@href"));
      exist = true;
    }
  }
  if (!exist) {
    page.setSkip(true);
  }
}

代码示例来源:origin: code4craft/webmagic

/**
 * Selects the rows of the question data grid and prints, for every row after
 * the header, the raw row markup followed by its title, answer count and the
 * text of its third cell.
 *
 * @param page the page being processed
 */
public void process(Page page) {
  Html html = page.getHtml();
  List<String> rows = html.xpath("//table[@class='tgCustomerCommunityCenterColumn']//div[@class='content']//table[@class='dataGrid']//tr").all();
  if (rows == null || rows.size() <= 1) {
    return;
  }
  // Row 0 holds the column names, so data rows start at index 1.
  for (String row : rows.subList(1, rows.size())) {
    System.out.println(row);
    // Wrap the bare <tr> in a <table> before handing it to Html.create.
    Html rowHtml = Html.create("<table>" + row + "</table>");

    String title = rowHtml.xpath("//td[@class='title']//a/text()").toString();
    System.out.println(title);

    String answerCount = rowHtml.xpath("//td[@class='num']/text()").toString();
    System.out.println(answerCount);

    String created = rowHtml.xpath("//td[3]/text()").toString();
    System.out.println(created);
  }
}

代码示例来源:origin: code4craft/webmagic

/**
 * Queues the picture-board links found in the raw markup and stores the
 * title selection and the smart-content extraction on the page.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
  page.addTargetRequests(
      page.getHtml().regex("<a[^<>]*href=(bbstcon\\?board=Pictures&file=[^>]*)").all());

  page.putField("title", page.getHtml().xpath("//div[@id='content']//h2/a"));
  page.putField("content", page.getHtml().smartContent());
}

代码示例来源:origin: code4craft/webmagic

/**
 * Stores the repository name taken from the entry-title heading under the
 * {@code reponame} field.
 *
 * @param page the page being processed
 * @return always {@code RequestMatcher.MatchOther.YES}
 */
@Override
public RequestMatcher.MatchOther processPage(Page page) {
  String repoName =
      page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString();
  page.putField("reponame", repoName);
  return RequestMatcher.MatchOther.YES;
}

代码示例来源:origin: code4craft/webmagic

/**
 * Queues every blog-post link on the page and stores the title element and
 * the smart-content extraction as strings.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
  List<String> blogLinks =
      page.getHtml().links().regex(".*yanghaoli\\.iteye\\.com/blog/\\d+").all();
  page.addTargetRequests(blogLinks);

  String title = page.getHtml().xpath("//title").toString();
  page.putField("title", title);

  String content = page.getHtml().smartContent().toString();
  page.putField("content", content);
}

代码示例来源:origin: code4craft/webmagic

/**
 * Queues every link containing {@code /yewu/} and stores the title (by regex)
 * and the {@code dd} elements (by XPath) on the page.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
  page.addTargetRequests(page.getHtml().links().regex(".*/yewu/.*").all());

  page.putField("title", page.getHtml().regex("<title>(.*)</title>"));
  page.putField("body", page.getHtml().xpath("//dd"));
}

代码示例来源:origin: code4craft/webmagic

/**
 * Queues every link matching {@code urlPattern} and stores the title
 * selection, the full markup string and the smart-content extraction.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
  // URLs to fetch next.
  List<String> matchingLinks = page.getHtml().links().regex(urlPattern).all();
  page.addTargetRequests(matchingLinks);

  // Title is extracted by XPath.
  page.putField("title", page.getHtml().xpath("//title"));

  String markup = page.getHtml().toString();
  page.putField("html", markup);

  // Main content is extracted by Readability.
  page.putField("content", page.getHtml().smartContent());
}

代码示例来源:origin: zyongjava/spider

// NOTE(review): this excerpt is truncated. The braces opened below are never
// closed, and `br` and `build` are not declared in the visible lines, so the
// statements following page.setSkip(true) presumably belong to a loop over
// `buildings` that was elided. Confirm against the original project.
@Override
public void process(Page page) {
  // Links matching "/xf/.*"; declared as a raw List in the excerpt.
  List pagination = page.getHtml().links().regex("/xf/.*").all();
  // Text of the 3rd and 4th anchors inside the "pro_crum" div.
  String region = page.getHtml().xpath("//div[@class=pro_crum]/a[3]/text()").toString();
  String name = page.getHtml().xpath("//div[@class=pro_crum]/a[4]/text()").toString();
  // One entry per table row under the "houseList" element.
  List<String> buildings = page.getHtml().xpath("//div[@id=houseList]/dl/dd/div[@class=lptabl]/table/tbody/tr").all();
  if (CollectionUtils.isEmpty(buildings)) {
    page.setSkip(true);
    // Wraps `build` in <table> tags before parsing it as Html.
    br.append("<table>").append(build).append("</table>");
    Html html = new Html(br.toString());
    // Cells 1-7 of the row are read positionally into the fields below.
    String building = html.xpath("//tr/td[1]/text()").toString();
    String unit = html.xpath("//tr/td[2]/text()").toString();
    String floorNumber = html.xpath("//tr/td[3]/text()").toString();
    String doorNumber = html.xpath("//tr/td[4]/text()").toString();
    String area = html.xpath("//tr/td[5]/text()").toString();
    String houseType = html.xpath("//tr/td[6]/text()").toString();
    String price = html.xpath("//tr/td[7]/text()").toString();

代码示例来源:origin: code4craft/webmagic

/**
 * Stores the lemma title text (CSS selector) and the lemma summary text
 * (XPath) on the page.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
  String lemmaName = page.getHtml().css("dl.lemmaWgt-lemmaTitle h1", "text").toString();
  page.putField("name", lemmaName);

  page.putField(
      "description",
      page.getHtml().xpath("//div[@class='lemma-summary']/allText()"));
}

代码示例来源:origin: code4craft/webmagic

/**
 * Queues every minibook link, then either stores the PDF links found on the
 * page under {@code pdf} or marks the result items as skipped when there are none.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
  List<String> minibookLinks =
      page.getHtml().links().regex("http://www\\.infoq\\.com/cn/minibooks/.*").all();
  page.addTargetRequests(minibookLinks);

  List<String> pdfLinks = page.getHtml().links().regex(".*\\.pdf").all();
  if (CollectionUtils.isNotEmpty(pdfLinks)) {
    page.putField("pdf", pdfLinks);
    return;
  }
  // No PDF on this page: nothing worth persisting.
  page.getResultItems().setSkip(true);
}

代码示例来源:origin: code4craft/webmagic

/**
 * Static factory that wraps the given text in a new {@code Html} instance.
 *
 * @param text the text passed to the {@code Html} constructor
 * @return a new {@code Html} built from {@code text}
 */
public static Html create(String text) {
  Html created = new Html(text);
  return created;
}

代码示例来源:origin: code4craft/webmagic

/**
 * Stores a tab-separated "province, district, zip code" line under
 * {@code result} and queues every six-digit ip138 link found on the page,
 * carrying the province and district along as request extras.
 *
 * @param page the page being processed
 */
private void processDistrict(Page page) {
  String province = page.getRequest().getExtra("province").toString();
  String district = page.getRequest().getExtra("district").toString();
  String zipCode = page.getHtml().regex("<h2>邮编:(\\d+)</h2>").toString();

  String[] columns = new String[]{province, district, zipCode};
  page.putField("result", StringUtils.join(columns, "\t"));

  List<String> zipLinks = page.getHtml().links().regex("http://www\\.ip138\\.com/\\d{6}[/]?$").all();
  for (String zipLink : zipLinks) {
    Request next = new Request(zipLink)
        .setPriority(2)
        .putExtra("province", province)
        .putExtra("district", district);
    page.addTargetRequest(next);
  }
}

代码示例来源:origin: code4craft/webmagic

/**
 * Collects links from the whole page, or only from the region chosen by
 * {@code urlRegionSelector} when it is non-null, and queues the matched part
 * of each link for every pattern that finds a match in it.
 *
 * @param page the page being processed
 * @param urlRegionSelector selector limiting where links are collected; may be {@code null}
 * @param urlPatterns patterns a link is tested against
 */
private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
  List<String> candidates = urlRegionSelector == null
      ? page.getHtml().links().all()
      : page.getHtml().selectList(urlRegionSelector).links().all();

  for (String candidate : candidates) {
    for (Pattern urlPattern : urlPatterns) {
      Matcher match = urlPattern.matcher(candidate);
      if (!match.find()) {
        continue;
      }
      // Only the matched portion of the link is queued, not the whole link.
      page.addTargetRequest(new Request(match.group(0)));
    }
  }
}

代码示例来源:origin: stackoverflow.com

// Build the tag tree through an anonymous Html subclass whose fields create the children.
Html html = new Html(null) {

      Head headTag = new Head(this);

      Body bodyTag = new Body(this) {

        Blank text = new Blank(this, "Hello World");

      };

};
// Prepend the doc type <!DOCTYPE html> to the generated output.
html.setPrependDocType(true);
String markup = html.toHtmlString();
System.out.println(markup);

代码示例来源:origin: stackoverflow.com

// Build html > (head, body > h1 > text) using nested instance initializers.
Html html = new Html(null) {{
  new Head(this);
  new Body(this) {{
    new H1(this) {{
      new NoTag(this, "spacial characters taken from an external resource like file/database :  女 学校 ä ö ü Ä");
    }};
  }};
}};

// Render with an explicit UTF-8 charset and print the result.
String utf8Markup = html.toHtmlString(StandardCharsets.UTF_8);
System.out.println(utf8Markup);

代码示例来源:origin: stackoverflow.com

// Style applied to the main div: centered items on an aqua background.
Style mainDivStyle = new Style();
mainDivStyle.addCssProperties(AlignItems.CENTER, new BackgroundColor(
    CssColorName.AQUA.getColorName()));

// html > body > (div with the style above, footer)
Html html = new Html(null) {
  Body body = new Body(this) {
    Div mainDiv = new Div(this, mainDivStyle);
    Footer footer = new Footer(this);
  };
};

// try-with-resources guarantees the file stream is closed even when writing fails;
// the original opened the stream inline and never closed it.
try (FileOutputStream out = new FileOutputStream("/home/ansgar/html_work/html-by-wffweb.html")) {

  //to write to a file
  html.toOutputStream(out);

  //to print in to console
  //System.out.println(html.toHtmlString());
} catch (FileNotFoundException e) {
  e.printStackTrace();
} catch (IOException e) {
  e.printStackTrace();
}

代码示例来源:origin: stackoverflow.com

// Put an Html component carrying the markup inside a new cell, then add the cell to the item.
Html htmlContent = new Html();
htmlContent.setContent("MY HTML STRING HERE");

Listcell cell = new Listcell();
cell.appendChild(htmlContent);
listitem.appendChild(cell);

代码示例来源:origin: stackoverflow.com

// NOTE(review): this excerpt is truncated. The anonymous Html subclass body
// opened on the next line and the method itself are never closed in the
// visible lines; the tag-building code was presumably elided. Confirm against
// the original answer before reusing.
public AbstractHtml render() {
  Html html = new Html(null) {
  // Ask for the doc type to be prepended, then return the root tag.
  html.setPrependDocType(true);
  return html;

代码示例来源:origin: code4craft/webmagic

/**
 * Scans the white-background table rows for "district name + link" pairs and
 * queues one request per pair, tagged with the current province and the
 * district name.
 *
 * @param page the page being processed
 */
private void processProvince(Page page) {
  // XPath alone cannot pinpoint the cells, so a regex is used as a filter;
  // rows that do not match the regex are dropped.
  List<String> rows = page.getHtml().xpath("//body/table/tbody/tr[@bgcolor=\"#ffffff\"]").all();
  Pattern districtCell = Pattern.compile("<td>([^<>]+)</td>.*?href=\"(.*?)\"",Pattern.DOTALL);
  for (String row : rows) {
    Matcher cells = districtCell.matcher(row);
    while (cells.find()) {
      String districtName = cells.group(1);
      String districtUrl = cells.group(2);
      Request next = new Request(districtUrl)
          .setPriority(1)
          .putExtra("province", page.getRequest().getExtra("province"))
          .putExtra("district", districtName);
      page.addTargetRequest(next);
    }
  }
}

相关文章

微信公众号

最新文章

更多