edu.uci.ics.crawler4j.url.WebURL类的使用及代码示例

x33g5p2x  于2022-02-03 转载在 其他  
字(9.4k)|赞(0)|评价(0)|浏览(118)

本文整理了Java中edu.uci.ics.crawler4j.url.WebURL类的一些代码示例,展示了WebURL类的具体用法。这些代码示例主要来源于Github/Stackoverflow/Maven等平台,是从一些精选项目中提取出来的代码,具有较强的参考意义,能在一定程度上帮助到你。WebURL类的具体详情如下:
包路径:edu.uci.ics.crawler4j.url.WebURL
类名称:WebURL

WebURL介绍

暂无

代码示例

代码示例来源:origin: yasserg/crawler4j

/**
 * Hook called when the crawler is redirected to an invalid Location.
 *
 * <p>This implementation only logs a warning that names the URL of the redirecting page.
 *
 * @param page the page whose redirect target was invalid
 */
protected void onRedirectedToInvalidUrl(Page page) {
  final String redirectingUrl = page.url.getURL();
  logger.warn("Unexpected error, URL: {} is redirected to NOTHING", redirectingUrl);
}

代码示例来源:origin: yasserg/crawler4j

WebURL webURL = new WebURL();
    webURL.setTldList(myController.getTldList());
    webURL.setURL(movedToUrl);
    webURL.setParentDocid(curURL.getParentDocid());
    webURL.setParentUrl(curURL.getParentUrl());
    webURL.setDepth(curURL.getDepth());
    webURL.setDocid(-1);
    webURL.setAnchor(curURL.getAnchor());
    if (shouldVisit(page, webURL)) {
      if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
        webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
        frontier.schedule(webURL);
      } else {
        logger.debug(
          "Not visiting: {} as per the server's \"robots.txt\" policy",
          webURL.getURL());
             webURL.getURL());
             fetchResult.getEntity().getContentType() == null ? "" :
             fetchResult.getEntity().getContentType().getValue();
  onUnexpectedStatusCode(curURL.getURL(), fetchResult.getStatusCode(),
              contentType, description);
if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
  if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
    logger.debug("Redirect page: {} has already been seen", curURL);
    return;

代码示例来源:origin: yasserg/crawler4j

/**
 * Builds the set of outgoing links referenced from this resource's text content.
 *
 * <p>Each URL returned by {@code extractUrlInCssText} is first made relative to the path of the
 * referring page and then resolved to an absolute URL against the canonical form of the
 * referring page's URL.
 *
 * @param referringPage the page that references this content; supplies the base path and URL
 * @return a new set holding a {@link WebURL} for each extracted URL; empty when none were found
 * @throws UnsupportedEncodingException if one of the URL helper calls fails with it (the
 *     declaring helper is outside this snippet)
 */
private Set<WebURL> parseOutgoingUrls(WebURL referringPage) throws UnsupportedEncodingException {
  Set<String> extractedUrls = extractUrlInCssText(this.getTextContent());
  final String pagePath = referringPage.getPath();
  final String pageUrl = referringPage.getURL();
  Set<WebURL> outgoingUrls = new HashSet<>();
  if (extractedUrls.isEmpty()) {
    // Nothing to resolve, so skip canonicalizing the base URL (as the original loop did).
    return outgoingUrls;
  }

  // The base URL is identical for every link: canonicalize it once instead of per iteration.
  final String canonicalPageUrl = URLCanonicalizer.getCanonicalURL(pageUrl);
  for (String url : extractedUrls) {
    String relative = getLinkRelativeTo(pagePath, url);
    String absolute = getAbsoluteUrlFrom(canonicalPageUrl, relative);
    WebURL webURL = new WebURL();
    webURL.setURL(absolute);
    outgoingUrls.add(webURL);
  }
  return outgoingUrls;
}

代码示例来源:origin: yasserg/crawler4j

/**
 * Builds the 6-byte database key for a URL.
 *
 * <p>Index 0 holds the priority, index 1 holds the depth capped at {@link Byte#MAX_VALUE}, and
 * the doc id is written into the array starting at index 2 via {@code Util.putIntInByteArray}.
 *
 * @param url the URL whose priority, depth and doc id form the key
 * @return a {@code DatabaseEntry} wrapping the key bytes
 */
protected static DatabaseEntry getDatabaseEntryKey(WebURL url) {
  final byte[] key = new byte[6];
  key[0] = url.getPriority();
  // Depths above Byte.MAX_VALUE are clamped so the narrowing cast cannot wrap them.
  key[1] = (byte) Math.min(url.getDepth(), Byte.MAX_VALUE);
  Util.putIntInByteArray(url.getDocid(), key, 2);
  return new DatabaseEntry(key);
}

代码示例来源:origin: yasserg/crawler4j

/**
 * Rebuilds a {@link WebURL} from its tuple form.
 *
 * <p>Fields are read in this fixed order: URL, doc id, parent doc id, parent URL, depth,
 * priority, anchor.
 *
 * @param input the tuple input positioned at the start of a serialized URL
 * @return a new {@link WebURL} populated with the values read
 */
@Override
public WebURL entryToObject(TupleInput input) {
  // Read everything first, in serialization order, then populate the object.
  final String address = input.readString();
  final int docId = input.readInt();
  final int parentDocId = input.readInt();
  final String parentAddress = input.readString();
  final short depth = input.readShort();
  final byte priority = input.readByte();
  final String anchor = input.readString();

  final WebURL restored = new WebURL();
  restored.setURL(address);
  restored.setDocid(docId);
  restored.setParentDocid(parentDocId);
  restored.setParentUrl(parentAddress);
  restored.setDepth(depth);
  restored.setPriority(priority);
  restored.setAnchor(anchor);
  return restored;
}

代码示例来源:origin: yasserg/crawler4j

/**
 * Writes a {@link WebURL} to the tuple output.
 *
 * <p>Fields are written in this fixed order: URL, doc id, parent doc id, parent URL, depth,
 * priority, anchor.
 *
 * @param url the URL to serialize
 * @param output the tuple output that receives the fields
 */
@Override
  public void objectToEntry(WebURL url, TupleOutput output) {
    // Snapshot the fields first; the writes below keep the serialization order.
    final String address = url.getURL();
    final int docId = url.getDocid();
    final int parentDocId = url.getParentDocid();
    final String parentAddress = url.getParentUrl();
    final short depth = url.getDepth();
    final byte priority = url.getPriority();
    final String anchor = url.getAnchor();

    output.writeString(address);
    output.writeInt(docId);
    output.writeInt(parentDocId);
    output.writeString(parentAddress);
    output.writeShort(depth);
    output.writeByte(priority);
    output.writeString(anchor);
  }
}

代码示例来源:origin: yasserg/crawler4j

WebURL webUrl = new WebURL();
webUrl.setTldList(tldList);
webUrl.setURL(canonicalUrl);
webUrl.setDocid(docId);
webUrl.setDepth((short) 0);
if (robotstxtServer.allows(webUrl)) {
  frontier.schedule(webUrl);

代码示例来源:origin: biezhi/java-library-examples

// Copy the individual fields of the page's WebURL into locals.
// Each value is read through a separate page.getWebURL() call.
int docid = page.getWebURL().getDocid();
String url = page.getWebURL().getURL();
String domain = page.getWebURL().getDomain();
String path = page.getWebURL().getPath();
String subDomain = page.getWebURL().getSubDomain();
String parentUrl = page.getWebURL().getParentUrl();
String anchor = page.getWebURL().getAnchor();

代码示例来源:origin: biezhi/java-library-examples

/**
 * Fetches a single URL and parses the response.
 *
 * <p>The page is returned only when the fetch reports {@code HttpStatus.SC_OK}. Any exception
 * raised while fetching or parsing is logged and results in {@code null}. In every case where
 * a fetch result was obtained, {@code discardContentIfNotConsumed()} is called on it.
 *
 * @param url the address to download
 * @return the parsed page, or {@code null} for a non-OK status or after an exception
 */
private Page download(String url) {
  final WebURL target = new WebURL();
  target.setURL(url);
  PageFetchResult result = null;
  try {
    result = pageFetcher.fetchPage(target);
    if (result.getStatusCode() != HttpStatus.SC_OK) {
      return null;
    }
    final Page fetched = new Page(target);
    result.fetchContent(fetched, pageFetcher.getConfig().getMaxDownloadSize());
    parser.parse(fetched, target.getURL());
    return fetched;
  } catch (Exception e) {
    logger.error("Error occurred while fetching url: " + target.getURL(), e);
    return null;
  } finally {
    if (result != null) {
      result.discardContentIfNotConsumed();
    }
  }
}
}

代码示例来源:origin: edu.uci.ics/crawler4j

/**
 * Extracts every URL that the class-level {@code pattern} finds in the given text.
 *
 * <p>Matches that do not already start with an {@code http://} or {@code https://} scheme get
 * {@code http://} prepended. The scheme check is case-insensitive and requires the full
 * {@code ://} separator, so a host that merely begins with "http" (for example
 * {@code httpbin.org}) is still prefixed, and an upper-case scheme is not prefixed twice.
 *
 * @param input the text to scan; may be {@code null}
 * @return a new set holding a {@link WebURL} for each match; empty when {@code input} is
 *     {@code null} or nothing matches
 */
public static Set<WebURL> extractUrls(String input) {
  Set<WebURL> extractedUrls = new HashSet<>();
  if (input == null) {
    return extractedUrls;
  }
  Matcher matcher = pattern.matcher(input);
  while (matcher.find()) {
    String urlStr = matcher.group();
    if (!hasHttpScheme(urlStr)) {
      urlStr = "http://" + urlStr;
    }
    WebURL webURL = new WebURL();
    webURL.setURL(urlStr);
    extractedUrls.add(webURL);
  }
  return extractedUrls;
}

/** Tells whether the text starts with "http://" or "https://", ignoring case. */
private static boolean hasHttpScheme(String text) {
  return text.regionMatches(true, 0, "http://", 0, 7)
      || text.regionMatches(true, 0, "https://", 0, 8);
}

代码示例来源:origin: yasserg/crawler4j

String url = URLCanonicalizer.getCanonicalURL(href, contextURL, hrefCharset);
if (url != null) {
  WebURL webURL = new WebURL();
  webURL.setTldList(tldList);
  webURL.setURL(url);
  webURL.setTag(urlAnchorPair.getTag());
  webURL.setAnchor(urlAnchorPair.getAnchor());
  webURL.setAttributes(urlAnchorPair.getAttributes());
  outgoingUrls.add(webURL);
  urlCount++;

代码示例来源:origin: biezhi/java-library-examples

/**
 * Logs details of a visited page.
 *
 * <p>Always logs the doc id, URL and parent doc id. When the parse data is an
 * {@code HtmlParseData}, also logs the text length, the HTML length and the number of outgoing
 * links. A separator line is logged last.
 *
 * @param page the page that was visited
 */
@Override
  public void visit(Page page) {
    final WebURL visited = page.getWebURL();

    logger.debug("Docid: {}", visited.getDocid());
    logger.info("URL: {}", visited.getURL());
    logger.debug("Docid of parent page: {}", visited.getParentDocid());

    if (page.getParseData() instanceof HtmlParseData) {
      final HtmlParseData parsed = (HtmlParseData) page.getParseData();
      final Set<WebURL> outgoing = parsed.getOutgoingUrls();

      logger.debug("Text length: {}", parsed.getText().length());
      logger.debug("Html length: {}", parsed.getHtml().length());
      logger.debug("Number of outgoing links: {}", outgoing.size());
    }

    logger.debug("=============");
  }
}

代码示例来源:origin: biezhi/java-library-examples

/**
 * Logs a warning for every page whose status code is not {@code HttpStatus.SC_OK}.
 *
 * <p>A {@code SC_NOT_FOUND} status is reported as a broken link together with the parent URL
 * it was found on. Any other non-OK status is reported with its code and description.
 *
 * @param webUrl the URL that was fetched
 * @param statusCode the HTTP status code of the response
 * @param statusDescription the textual description accompanying the status code
 */
@Override
  protected void handlePageStatusCode(WebURL webUrl, int statusCode, String statusDescription) {
    if (statusCode == HttpStatus.SC_OK) {
      return;
    }

    if (statusCode == HttpStatus.SC_NOT_FOUND) {
      logger.warn("Broken link: {}, this link was found in page: {}", webUrl.getURL(),
          webUrl.getParentUrl());
    } else {
      // The trailing {} is required: without it the description argument is silently dropped.
      logger.warn("Non success status for link: {} status code: {}, description: {}",
          webUrl.getURL(), statusCode, statusDescription);
    }
  }
}

代码示例来源:origin: tim232385/WebVideoBot

/** Matches a path of the form {@code /embed/KEY}; group 2 captures everything after the prefix. */
private static final Pattern EMBED_PATTERN = Pattern.compile("(\\/embed\\/)(.*)");

/**
 * Extracts the embed key from the path of the given URL.
 *
 * <p>The pattern is compiled once and the path is matched a single time.
 *
 * @param webURL the URL whose path is inspected
 * @return the text following {@code /embed/} when the whole path has that form, otherwise an
 *     empty string
 */
public String getEmbedKey(WebURL webURL) {
  // Fully qualified so the block does not depend on an import the file may lack.
  final java.util.regex.Matcher matcher = EMBED_PATTERN.matcher(webURL.getPath());
  return matcher.matches() ? matcher.group(2) : "";
}

代码示例来源:origin: edu.uci.ics/crawler4j

/**
 * Serializes a {@link WebURL} into the tuple output.
 *
 * <p>Write order: URL, doc id, parent doc id, parent URL, depth, priority, anchor.
 *
 * @param url the URL to serialize
 * @param output the tuple output that receives the fields
 */
@Override
  public void objectToEntry(WebURL url, TupleOutput output) {
    // Gather the values up front; the writes below follow the serialization order.
    final String location = url.getURL();
    final int id = url.getDocid();
    final int parentId = url.getParentDocid();
    final String parentLocation = url.getParentUrl();
    final short level = url.getDepth();
    final byte rank = url.getPriority();
    final String anchorText = url.getAnchor();

    output.writeString(location);
    output.writeInt(id);
    output.writeInt(parentId);
    output.writeString(parentLocation);
    output.writeShort(level);
    output.writeByte(rank);
    output.writeString(anchorText);
  }
}

代码示例来源:origin: stackoverflow.com

/**
 * Registers a seed URL and schedules it on the frontier.
 *
 * <p>The URL is canonicalized first; an invalid URL is logged and ignored. With a negative
 * {@code docId} the doc id is looked up, and the seed is skipped when the lookup returns a
 * positive id (already seen); otherwise a new id is assigned. With a non-negative
 * {@code docId} the URL/id pair is registered, and a failure to register is logged without
 * aborting. The seed is scheduled only when {@code robotstxtServer} allows it.
 *
 * @param pageUrl the seed URL as supplied by the caller
 * @param docId the doc id to use, or a negative value to have one assigned
 */
public void addSeed(String pageUrl, int docId) {
  final String canonicalUrl = URLCanonicalizer.getCanonicalURL(pageUrl);
  if (canonicalUrl == null) {
    logger.error("Invalid seed URL: " + pageUrl);
    return;
  }

  int seedDocId = docId;
  if (seedDocId >= 0) {
    try {
      docIdServer.addUrlAndDocId(canonicalUrl, seedDocId);
    } catch (Exception e) {
      logger.error("Could not add seed: " + e.getMessage());
    }
  } else {
    if (docIdServer.getDocId(canonicalUrl) > 0) {
      // This URL is already seen.
      return;
    }
    seedDocId = docIdServer.getNewDocID(canonicalUrl);
  }

  final WebURL seed = new WebURL();
  seed.setURL(canonicalUrl);
  seed.setDocid(seedDocId);
  seed.setDepth((short) 0);
  if (robotstxtServer.allows(seed)) {
    // Adds the URL to the frontier at run time.
    frontier.schedule(seed);
  } else {
    logger.info("Robots.txt does not allow this seed: " + pageUrl);
  }
}

代码示例来源:origin: edu.uci.ics/crawler4j

/**
 * Deserializes a {@link WebURL} from the tuple input.
 *
 * <p>Read order: URL, doc id, parent doc id, parent URL, depth, priority, anchor.
 *
 * @param input the tuple input to read the fields from
 * @return a new {@link WebURL} carrying the values read
 */
@Override
public WebURL entryToObject(TupleInput input) {
  // Consume the tuple in serialization order before building the result.
  final String location = input.readString();
  final int id = input.readInt();
  final int parentId = input.readInt();
  final String parentLocation = input.readString();
  final short level = input.readShort();
  final byte rank = input.readByte();
  final String anchorText = input.readString();

  final WebURL decoded = new WebURL();
  decoded.setURL(location);
  decoded.setDocid(id);
  decoded.setParentDocid(parentId);
  decoded.setParentUrl(parentLocation);
  decoded.setDepth(level);
  decoded.setPriority(rank);
  decoded.setAnchor(anchorText);
  return decoded;
}

代码示例来源:origin: tim232385/WebVideoBot

/**
 * Rewrites a URL to its embed form when a view key can be extracted from it.
 *
 * <p>When {@code getViewkey} yields a key, a new {@link WebURL} pointing at the embed address
 * for that key is returned. Otherwise the decision is delegated to the superclass. The
 * superclass is consulted lazily, so it is not invoked when a rewritten URL is produced.
 *
 * @param webURL the URL about to be processed
 * @return the embed URL for the extracted key, or the superclass result when there is no key
 */
@Override
protected WebURL handleUrlBeforeProcess(WebURL webURL) {
  return getViewkey(webURL)
      .map(key -> "https://www.pornhub.com/embed/" + key)
      .map(url -> {
        WebURL newUrl = new WebURL();
        newUrl.setURL(url);
        return newUrl;
      })
      // orElseGet defers the fallback; orElse would evaluate the super call unconditionally.
      .orElseGet(() -> super.handleUrlBeforeProcess(webURL));
}

代码示例来源:origin: edu.uci.ics/crawler4j

String url = URLCanonicalizer.getCanonicalURL(href, contextURL, hrefCharset);
if (url != null) {
  WebURL webURL = new WebURL();
  webURL.setURL(url);
  webURL.setTag(urlAnchorPair.getTag());
  webURL.setAnchor(urlAnchorPair.getAnchor());
  webURL.setAttributes(urlAnchorPair.getAttributes());
  outgoingUrls.add(webURL);
  urlCount++;

代码示例来源:origin: biezhi/java-library-examples

/**
 * Logs what is known about a page once it has been visited.
 *
 * <p>The doc id, URL and parent doc id are always logged. For pages whose parse data is an
 * {@code HtmlParseData}, the text length, HTML length and outgoing-link count are logged as
 * well. The method ends by logging a separator line.
 *
 * @param page the page that was visited
 */
@Override
  public void visit(Page page) {
    final WebURL pageUrl = page.getWebURL();

    logger.debug("Docid: {}", pageUrl.getDocid());
    logger.info("URL: {}", pageUrl.getURL());
    logger.debug("Docid of parent page: {}", pageUrl.getParentDocid());

    if (page.getParseData() instanceof HtmlParseData) {
      final HtmlParseData htmlData = (HtmlParseData) page.getParseData();
      final Set<WebURL> outgoingLinks = htmlData.getOutgoingUrls();

      logger.debug("Text length: {}", htmlData.getText().length());
      logger.debug("Html length: {}", htmlData.getHtml().length());
      logger.debug("Number of outgoing links: {}", outgoingLinks.size());
    }

    logger.debug("=============");
  }
}

相关文章