本文整理了Java中us.codecraft.webmagic.selector.Html类的一些代码示例,展示了Html类的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。Html类的具体详情如下:
包路径:us.codecraft.webmagic.selector.Html
类名称:Html
[英]Selectable html.
[中]可选择的html。
代码示例来源:origin: code4craft/webmagic
/**
 * Queues follow-up requests and stores the dish names and prices found on the page.
 *
 * <p>Only the first two "area_link flat_btn" links are followed; every link that
 * matches the restaurant URL pattern is followed as well.
 *
 * @param page the page currently being processed
 */
@Override
public void process(Page page) {
    // Sample page: http://progressdaily.diandian.com/post/2013-01-24/40046867275
    final String dishNameXpath =
            "//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]/text()";
    final String dishPriceXpath =
            "//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]/text()";

    List<String> areaLinks =
            page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").all();
    // Cap the number of area links that get queued at two.
    List<String> cappedAreaLinks = areaLinks.size() > 2 ? areaLinks.subList(0, 2) : areaLinks;
    page.addTargetRequests(cappedAreaLinks);

    List<String> restaurantLinks =
            page.getHtml().links().regex("(.*/restaurant/[^#]+)").all();
    page.addTargetRequests(restaurantLinks);

    page.putField("items", page.getHtml().xpath(dishNameXpath));
    page.putField("prices", page.getHtml().xpath(dishPriceXpath));
}
代码示例来源:origin: code4craft/webmagic
/**
 * Queues related-question links and keeps the page only when at least one answer
 * reaches the vote threshold {@code voteNum} (declared outside this method).
 *
 * <p>For every answer whose vote count is at least {@code voteNum}, the fields
 * {@code vote}, {@code content} and {@code userid} are written; a later qualifying
 * answer overwrites the values of an earlier one. When no answer qualifies the page
 * is marked as skipped.
 *
 * @param page the page currently being processed
 */
@Override
public void process(Page page) {
    List<String> relativeUrl = page.getHtml().xpath("//li[@class='item clearfix']/div/a/@href").all();
    page.addTargetRequests(relativeUrl);
    relativeUrl = page.getHtml().xpath("//div[@id='zh-question-related-questions']//a[@class='question_link']/@href").all();
    page.addTargetRequests(relativeUrl);

    List<String> answers = page.getHtml().xpath("//div[@id='zh-question-answer-wrap']/div").all();
    boolean exist = false;
    for (String answer : answers) {
        // Parse the answer fragment once and reuse it for all three selectors.
        Html answerHtml = new Html(answer);
        String vote = answerHtml.xpath("//div[@class='zm-votebar']//span[@class='count']/text()").toString();
        if (vote == null) {
            // No vote counter in this fragment; Integer parsing would throw, so skip the answer.
            continue;
        }
        int voteCount;
        try {
            voteCount = Integer.parseInt(vote.trim());
        } catch (NumberFormatException ignored) {
            // Vote text is not a plain integer; treat the answer as not qualifying
            // instead of aborting the whole page.
            continue;
        }
        if (voteCount >= voteNum) {
            page.putField("vote", vote);
            page.putField("content", answerHtml.xpath("//div[@class='zm-editable-content']"));
            page.putField("userid", answerHtml.xpath("//a[@class='author-link']/@href"));
            exist = true;
        }
    }
    if (!exist) {
        page.setSkip(true);
    }
}
代码示例来源:origin: code4craft/webmagic
/**
 * Prints the title, answer count and third-column text of every data row in the
 * question table on the page.
 *
 * @param page the page currently being processed
 */
public void process(Page page) {
    Html html = page.getHtml();
    List<String> rows = html.xpath("//table[@class='tgCustomerCommunityCenterColumn']//div[@class='content']//table[@class='dataGrid']//tr").all();
    if (rows == null || rows.size() <= 1) {
        return;
    }
    // Row 0 holds the column names, so data rows start at index 1.
    for (String row : rows.subList(1, rows.size())) {
        System.out.println(row);
        // Wrap the bare <tr> in a <table> before parsing it on its own.
        Html rowHtml = Html.create("<table>" + row + "</table>");
        System.out.println(rowHtml.xpath("//td[@class='title']//a/text()").toString());
        System.out.println(rowHtml.xpath("//td[@class='num']/text()").toString());
        System.out.println(rowHtml.xpath("//td[3]/text()").toString());
    }
}
代码示例来源:origin: code4craft/webmagic
/**
 * Queues the picture-board post links found in the raw markup and stores the
 * page title and smart-extracted content.
 *
 * @param page the page currently being processed
 */
@Override
public void process(Page page) {
    final String postLinkRegex = "<a[^<>]*href=(bbstcon\\?board=Pictures&file=[^>]*)";
    List<String> postLinks = page.getHtml().regex(postLinkRegex).all();
    page.addTargetRequests(postLinks);

    page.putField("title", page.getHtml().xpath("//div[@id='content']//h2/a"));
    page.putField("content", page.getHtml().smartContent());
}
代码示例来源:origin: code4craft/webmagic
/**
 * Stores the repository name read from the page heading under {@code reponame}.
 *
 * @param page the page currently being processed
 * @return always {@code RequestMatcher.MatchOther.YES}
 */
@Override
public RequestMatcher.MatchOther processPage(Page page) {
    String repoName = page.getHtml()
            .xpath("//h1[@class='entry-title public']/strong/a/text()")
            .toString();
    page.putField("reponame", repoName);
    return RequestMatcher.MatchOther.YES;
}
代码示例来源:origin: code4craft/webmagic
/**
 * Queues every blog-post link of the target blog and stores the page title and
 * smart-extracted content as strings.
 *
 * @param page the page currently being processed
 */
@Override
public void process(Page page) {
    List<String> postLinks = page.getHtml()
            .links()
            .regex(".*yanghaoli\\.iteye\\.com/blog/\\d+")
            .all();
    page.addTargetRequests(postLinks);

    String title = page.getHtml().xpath("//title").toString();
    page.putField("title", title);

    String content = page.getHtml().smartContent().toString();
    page.putField("content", content);
}
代码示例来源:origin: code4craft/webmagic
/**
 * Queues all links whose URL contains {@code /yewu/} and stores the title
 * (via regex) and the {@code <dd>} elements (via XPath).
 *
 * @param page the page currently being processed
 */
@Override
public void process(Page page) {
    page.addTargetRequests(page.getHtml().links().regex(".*/yewu/.*").all());

    final String titleRegex = "<title>(.*)</title>";
    page.putField("title", page.getHtml().regex(titleRegex));
    page.putField("body", page.getHtml().xpath("//dd"));
}
代码示例来源:origin: code4craft/webmagic
/**
 * Queues links matching {@code urlPattern} (declared outside this method) and
 * stores the page title, full markup and smart-extracted content.
 *
 * @param page the page currently being processed
 */
@Override
public void process(Page page) {
    // Schedule every link that matches the configured pattern.
    page.addTargetRequests(page.getHtml().links().regex(urlPattern).all());

    // Title comes from an XPath selection; "html" is the whole document as text.
    page.putField("title", page.getHtml().xpath("//title"));
    String rawMarkup = page.getHtml().toString();
    page.putField("html", rawMarkup);

    // Main content comes from the smart-content extractor.
    page.putField("content", page.getHtml().smartContent());
}
代码示例来源:origin: zyongjava/spider
// Excerpt of a page processor that reads a region, a name and a list of table rows
// from the page, then pulls seven cell values out of a rebuilt row table.
// NOTE(review): this excerpt is elided — see the notes below; it does not compile as shown.
@Override
public void process(Page page) {
// Raw List (no type argument) of links matching "/xf/.*"; not used in the visible lines.
List pagination = page.getHtml().links().regex("/xf/.*").all();
// NOTE(review): the attribute values in these XPath expressions are unquoted
// (e.g. @class=pro_crum) — confirm the selector engine treats them as string literals.
String region = page.getHtml().xpath("//div[@class=pro_crum]/a[3]/text()").toString();
String name = page.getHtml().xpath("//div[@class=pro_crum]/a[4]/text()").toString();
List<String> buildings = page.getHtml().xpath("//div[@id=houseList]/dl/dd/div[@class=lptabl]/table/tbody/tr").all();
// Skip the page when no rows were found.
if (CollectionUtils.isEmpty(buildings)) {
page.setSkip(true);
// NOTE(review): source text is missing here — the closing brace of the `if` and the
// declarations of `br` and `build` are not shown; presumably a loop over `buildings`.
br.append("<table>").append(build).append("</table>");
Html html = new Html(br.toString());
// Cells 1 through 7 of the row are read into the locals named below.
String building = html.xpath("//tr/td[1]/text()").toString();
String unit = html.xpath("//tr/td[2]/text()").toString();
String floorNumber = html.xpath("//tr/td[3]/text()").toString();
String doorNumber = html.xpath("//tr/td[4]/text()").toString();
String area = html.xpath("//tr/td[5]/text()").toString();
String houseType = html.xpath("//tr/td[6]/text()").toString();
String price = html.xpath("//tr/td[7]/text()").toString();
// NOTE(review): the excerpt ends here without closing the method.
代码示例来源:origin: code4craft/webmagic
/**
 * Stores the entry name (CSS selection) and its summary text (XPath selection).
 *
 * @param page the page currently being processed
 */
@Override
public void process(Page page) {
    String entryName = page.getHtml().css("dl.lemmaWgt-lemmaTitle h1", "text").toString();
    page.putField("name", entryName);

    final String summaryXpath = "//div[@class='lemma-summary']/allText()";
    page.putField("description", page.getHtml().xpath(summaryXpath));
}
代码示例来源:origin: code4craft/webmagic
/**
 * Queues mini-book pages and records the PDF links on the current page; the
 * page's result items are marked as skipped when it has no PDF link.
 *
 * @param page the page currently being processed
 */
@Override
public void process(Page page) {
    List<String> miniBookLinks = page.getHtml()
            .links()
            .regex("http://www\\.infoq\\.com/cn/minibooks/.*")
            .all();
    page.addTargetRequests(miniBookLinks);

    List<String> pdfLinks = page.getHtml().links().regex(".*\\.pdf").all();
    if (CollectionUtils.isNotEmpty(pdfLinks)) {
        page.putField("pdf", pdfLinks);
        return;
    }
    // Nothing worth persisting on this page.
    page.getResultItems().setSkip(true);
}
代码示例来源:origin: code4craft/webmagic
/**
 * Static factory that wraps the given text in a new {@code Html} instance.
 *
 * @param text the text handed to the {@code Html} constructor
 * @return the newly constructed instance
 */
public static Html create(String text) {
    final Html created = new Html(text);
    return created;
}
代码示例来源:origin: code4craft/webmagic
/**
 * Emits one tab-separated "province, district, zip code" result for the page and
 * queues the six-digit detail links, carrying province and district along as extras.
 *
 * @param page the page currently being processed; its request must carry the
 *             {@code province} and {@code district} extras
 */
private void processDistrict(Page page) {
    String province = page.getRequest().getExtra("province").toString();
    String district = page.getRequest().getExtra("district").toString();
    String zipCode = page.getHtml().regex("<h2>邮编:(\\d+)</h2>").toString();

    String[] columns = new String[]{province, district, zipCode};
    page.putField("result", StringUtils.join(columns, "\t"));

    List<String> detailLinks = page.getHtml()
            .links()
            .regex("http://www\\.ip138\\.com/\\d{6}[/]?$")
            .all();
    for (String detailLink : detailLinks) {
        Request followUp = new Request(detailLink)
                .setPriority(2)
                .putExtra("province", province)
                .putExtra("district", district);
        page.addTargetRequest(followUp);
    }
}
代码示例来源:origin: code4craft/webmagic
/**
 * Queues a request for every link that matches one of the given URL patterns.
 *
 * <p>Links are taken from the whole page when {@code urlRegionSelector} is
 * {@code null}, otherwise only from the region it selects. A link matching several
 * patterns is queued once per matching pattern, using the matched portion of the link.
 *
 * @param page              the page currently being processed
 * @param urlRegionSelector selector limiting where links are collected, or {@code null}
 * @param urlPatterns       patterns a link must match to be queued
 */
private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
    List<String> candidates = (urlRegionSelector == null)
            ? page.getHtml().links().all()
            : page.getHtml().selectList(urlRegionSelector).links().all();

    for (String candidate : candidates) {
        for (Pattern urlPattern : urlPatterns) {
            Matcher hit = urlPattern.matcher(candidate);
            if (!hit.find()) {
                continue;
            }
            page.addTargetRequest(new Request(hit.group(0)));
        }
    }
}
代码示例来源:origin: stackoverflow.com
// Builds a document tree with an anonymous Html subclass: each field initializer
// constructs a child tag and passes its parent (`this`) to the constructor.
// NOTE(review): this Html takes a parent argument and exposes toHtmlString(), so it is a
// different class from us.codecraft.webmagic.selector.Html — presumably the wffweb tag; confirm.
Html html = new Html(null) {
Head head = new Head(this);
Body body = new Body(this) {
// Content passed to the child created under the body tag.
Blank blank = new Blank(this, "Hello World");
};
};
// prepends the doc type <!DOCTYPE html>
html.setPrependDocType(true);
// Print the value returned by toHtmlString() to standard output.
System.out.println(html.toHtmlString());
代码示例来源:origin: stackoverflow.com
// Same tree-building idea written with instance-initializer blocks ("double brace"):
// each `{{ ... }}` is an anonymous subclass whose initializer creates the child tags.
// NOTE(review): not the WebMagic Html class — presumably the wffweb tag; confirm.
Html html = new Html(null) {{
new Head(this);
new Body(this) {{
new H1(this) {{
// Text child containing non-ASCII characters.
new NoTag(this, "spacial characters taken from an external resource like file/database : 女 学校 ä ö ü Ä");
}};
}};
}};
// UTF-8 is passed explicitly to toHtmlString; presumably it selects the output charset — confirm.
System.out.println(html.toHtmlString(StandardCharsets.UTF_8));
代码示例来源:origin: stackoverflow.com
// Style shared by the main div: centered items on an aqua background.
Style mainDivStyle = new Style();
mainDivStyle.addCssProperties(AlignItems.CENTER, new BackgroundColor(
        CssColorName.AQUA.getColorName()));
// Anonymous subclasses build the tree: html > body > (div with the style above, footer).
Html html = new Html(null) {
    Body body = new Body(this) {
        Div mainDiv = new Div(this, mainDivStyle);
        Footer footer = new Footer(this);
    };
};
// Write the document to a file. try-with-resources guarantees the stream is closed
// even when writing fails; the original never closed it.
try (FileOutputStream out = new FileOutputStream("/home/ansgar/html_work/html-by-wffweb.html")) {
    html.toOutputStream(out);
    // To print to the console instead:
    // System.out.println(html.toHtmlString());
} catch (FileNotFoundException e) {
    e.printStackTrace();
} catch (IOException e) {
    e.printStackTrace();
}
代码示例来源:origin: stackoverflow.com
// Put an Html component carrying the markup string into a new list cell, then
// attach that cell to the existing `listitem`.
Html htmlContent = new Html();
htmlContent.setContent("MY HTML STRING HERE");
Listcell cell = new Listcell();
cell.appendChild(htmlContent);
listitem.appendChild(cell);
代码示例来源:origin: stackoverflow.com
// NOTE(review): this excerpt is truncated — the anonymous Html subclass opened below is
// never closed and the method's closing brace is missing, so it does not compile as shown.
public AbstractHtml render() {
Html html = new Html(null) {
// Turns on the doc-type prefix before the document is returned.
html.setPrependDocType(true);
return html;
代码示例来源:origin: code4craft/webmagic
/**
 * Queues one request per district listed on a province page, tagging each request
 * with the current {@code province} extra and the district title.
 *
 * @param page the page currently being processed
 */
private void processProvince(Page page) {
    // XPath alone cannot pinpoint the rows precisely, so a regex acts as a filter:
    // rows that do not match it are dropped.
    Pattern districtPattern = Pattern.compile("<td>([^<>]+)</td>.*?href=\"(.*?)\"", Pattern.DOTALL);
    List<String> rows = page.getHtml().xpath("//body/table/tbody/tr[@bgcolor=\"#ffffff\"]").all();

    for (String row : rows) {
        for (Matcher cell = districtPattern.matcher(row); cell.find(); ) {
            String districtTitle = cell.group(1);
            String districtLink = cell.group(2);
            Request next = new Request(districtLink)
                    .setPriority(1)
                    .putExtra("province", page.getRequest().getExtra("province"))
                    .putExtra("district", districtTitle);
            page.addTargetRequest(next);
        }
    }
}
内容来源于网络,如有侵权,请联系作者删除!