Usage and code examples of edu.uci.ics.crawler4j.url.WebURL.getURL()


This article collects code examples of the Java method edu.uci.ics.crawler4j.url.WebURL.getURL() and shows how it is used in practice. The examples come mainly from GitHub, Stack Overflow, Maven, and similar platforms, extracted from selected projects, and should serve as useful references. The details of WebURL.getURL() are as follows:

Package: edu.uci.ics.crawler4j.url
Class: WebURL
Method: getURL

About WebURL.getURL

Returns the URL of this WebURL as a String.
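
A minimal sketch (not taken from any of the projects below) of storing a URL in a WebURL and reading it back with getURL():

WebURL webUrl = new WebURL();
webUrl.setURL("http://www.ics.uci.edu/index.html"); // store the URL string
String urlString = webUrl.getURL();                 // -> "http://www.ics.uci.edu/index.html"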

Code examples

Example source: yasserg/crawler4j

/**
 * Emitted when the crawler is redirected to an invalid Location.
 * @param page
 */
protected void onRedirectedToInvalidUrl(Page page) {
  logger.warn("Unexpected error, URL: {} is redirected to NOTHING",
    page.url.getURL());
}

Example source: yasserg/crawler4j

/**
 * This function is called if there has been an error in parsing the content.
 *
 * @param webUrl URL which failed on parsing
 */
@Deprecated
protected void onParseError(WebURL webUrl) {
  logger.warn("Parsing error of: {}", webUrl.getURL());
  // Do nothing by default (except logging)
  // Subclasses can override this to add their own functionality
}

Example source: yasserg/crawler4j

/**
 * This function is called if the content of a URL could not be fetched.
 *
 * @param webUrl URL which content failed to be fetched
 *
 * @deprecated use {@link #onContentFetchError(Page)}
 */
@Deprecated
protected void onContentFetchError(WebURL webUrl) {
  logger.warn("Can't fetch content of: {}", webUrl.getURL());
  // Do nothing by default (except basic logging)
  // Subclasses can override this to add their own functionality
}

Example source: yasserg/crawler4j

@Override
public boolean equals(Object o) {
  if (this == o) {
    return true;
  }
  if ((o == null) || (getClass() != o.getClass())) {
    return false;
  }
  WebURL otherUrl = (WebURL) o;
  return (url != null) && url.equals(otherUrl.getURL());
}

Example source: yasserg/crawler4j

@Override
public boolean shouldVisit(Page referringPage, WebURL url) {
  String href = url.getURL().toLowerCase();
  return !FILE_ENDING_EXCLUSION_PATTERN.matcher(href).matches();
}
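
This example assumes a FILE_ENDING_EXCLUSION_PATTERN constant defined elsewhere in the class. A plausible definition is sketched below (the project's actual pattern may differ; requires java.util.regex.Pattern):

// Sketch only: exclude URLs ending in common static-resource extensions.
private static final Pattern FILE_ENDING_EXCLUSION_PATTERN = Pattern.compile(
    ".*\\.(css|js|gif|jpe?g|png|mp3|mp4|zip|gz|pdf)$");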

Example source: yasserg/crawler4j

/**
 * This function is called if the content of a URL could not be fetched.
 *
 * @param page Partial page object
 */
protected void onContentFetchError(Page page) {
  logger.warn("Can't fetch content of: {}", page.getWebURL().getURL());
  // Do nothing by default (except basic logging)
  // Subclasses can override this to add their own functionality
}

Example source: yasserg/crawler4j

/**
 * This function is called when an unhandled exception is encountered during fetching.
 *
 * @param webUrl URL where the unhandled exception occurred
 * @param e the exception that was not handled
 */
protected void onUnhandledException(WebURL webUrl, Throwable e) {
  if (myController.getConfig().isHaltOnError() && !(e instanceof IOException)) {
    throw new RuntimeException("unhandled exception", e);
  } else {
    String urlStr = (webUrl == null ? "NULL" : webUrl.getURL());
    logger.warn("Unhandled exception while fetching {}: {}", urlStr, e.getMessage());
    logger.info("Stacktrace: ", e);
    // Do nothing by default (except basic logging)
    // Subclasses can override this to add their own functionality
  }
}

Example source: yasserg/crawler4j

public void setProcessed(WebURL webURL) {
  counters.increment(Counters.ReservedCounterNames.PROCESSED_PAGES);
  if (inProcessPages != null) {
    if (!inProcessPages.removeURL(webURL)) {
      logger.warn("Could not remove: {} from list of processed pages.", webURL.getURL());
    }
  }
}

Example source: yasserg/crawler4j

@Override
public void store(Page page) {
  if (page.getParseData() instanceof HtmlParseData) {
    try {
      HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
      insertKeyStatement.setString(1, htmlParseData.getHtml());
      insertKeyStatement.setString(2, htmlParseData.getText());
      insertKeyStatement.setString(3, page.getWebURL().getURL());
      insertKeyStatement.setTimestamp(4, new Timestamp(new java.util.Date().getTime()));
      insertKeyStatement.executeUpdate();
    } catch (SQLException e) {
      logger.error("SQL Exception while storing webpage for url'{}'", page.getWebURL().getURL(), e);
      throw new RuntimeException(e);
    }
  }
}
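
insertKeyStatement here is a PreparedStatement initialized elsewhere in the class; the excerpt does not show the schema, but a hypothetical setup consistent with the four parameters would be:

// Hypothetical table and column names; the real schema is not shown in the excerpt.
insertKeyStatement = connection.prepareStatement(
    "INSERT INTO webpage (html, text, url, created_at) VALUES (?, ?, ?, ?)");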

Example source: yasserg/crawler4j

// Excerpt from RobotstxtServer.allows(WebURL):
try {
  URL url = new URL(webURL.getURL());
  String host = getHost(url);
  String path = url.getPath();
  // ... look up the host's robots.txt rules and check the path ...
} catch (MalformedURLException e) {
  logger.error("Bad URL in Robots.txt: " + webURL.getURL(), e);
}
logger.warn("RobotstxtServer: default: allow {}", webURL.getURL());
return true;

Example source: yasserg/crawler4j

public boolean fetchContent(Page page, int maxBytes) throws SocketTimeoutException, IOException {
  try {
    page.setFetchResponseHeaders(responseHeaders);
    page.load(entity, maxBytes);
    return true;
  } catch (SocketTimeoutException e) {
    throw e;
  } catch (IOException | RuntimeException e) {
    if (haltOnError) {
      throw e;
    } else {
      logger.info("Exception while fetching content for: {} [{}]", page.getWebURL().getURL(),
            e.getMessage());
    }
  }
  return false;
}

Example source: yasserg/crawler4j

@Override
public void visit(Page page) {
  String url = page.getWebURL().getURL();
  logger.info("URL: " + url);
  if (page.getParseData() instanceof HtmlParseData) {
    HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
    String text = htmlParseData.getText();
    String html = htmlParseData.getHtml();
    Set<WebURL> links = htmlParseData.getOutgoingUrls();
    logger.info("Text length: " + text.length());
    logger.info("Html length: " + html.length());
    logger.info("Number of outgoing links: " + links.size());
    try {
      postgresDBService.store(page);
    } catch (RuntimeException e) {
      logger.error("Storing failed", e);
    }
  }
}

Example source: yasserg/crawler4j

private Set<WebURL> parseOutgoingUrls(WebURL referringPage) throws UnsupportedEncodingException {
  Set<String> extractedUrls = extractUrlInCssText(this.getTextContent());
  final String pagePath = referringPage.getPath();
  final String pageUrl = referringPage.getURL();
  Set<WebURL> outgoingUrls = new HashSet<>();
  for (String url : extractedUrls) {
    String relative = getLinkRelativeTo(pagePath, url);
    String absolute = getAbsoluteUrlFrom(URLCanonicalizer.getCanonicalURL(pageUrl), relative);
    WebURL webURL = new WebURL();
    webURL.setURL(absolute);
    outgoingUrls.add(webURL);
  }
  return outgoingUrls;
}

Example source: yasserg/crawler4j

// Excerpt from PageFetcher.fetchPage(WebURL):
String toFetchURL = webUrl.getURL();
HttpUriRequest request = null;
try {
  // ... build and execute the HTTP request for toFetchURL ...
} finally {
  // ... release the request/connection ...
}

Example source: yasserg/crawler4j

try {
  htmlParser.parse(inputStream, contentHandler, metadata, parseContext);
} catch (Exception e) {
  logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
  throw new ParseException("could not parse [" + page.getWebURL().getURL() + "]", e);
}
// ... and a later catch block in the same method:
} catch (Exception e) {
  logger.error("error parsing the html: " + page.getWebURL().getURL(), e);
  throw new ParseException("could not parse [" + page.getWebURL().getURL() + "]", e);
}

Example source: yasserg/crawler4j

// Excerpts from WebCrawler.processPage(WebURL), showing the getURL() call sites:

// Skipping a page that the server's robots.txt disallows:
logger.debug("Not visiting: {} as per the server's \"robots.txt\" policy",
    webURL.getURL());

// Reporting an unexpected HTTP status code:
String contentType = fetchResult.getEntity().getContentType() == null ? "" :
    fetchResult.getEntity().getContentType().getValue();
onUnexpectedStatusCode(curURL.getURL(), fetchResult.getStatusCode(),
    contentType, description);

// Detecting a redirect to a page that has already been seen:
if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
  if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
    logger.debug("Redirect page: {} has already been seen", curURL);
    // ...
  }
}

// Warning when a page of unknown size is truncated to the configured limit:
logger.warn("Warning: unknown page size exceeded max-download-size, truncated to: ({}), at URL: {}",
    myController.getConfig().getMaxDownloadSize(), curURL.getURL());

// Parsing the page and scheduling its outgoing links:
parser.parse(page, curURL.getURL());
for (WebURL webURL : parseData.getOutgoingUrls()) {
  webURL.setParentDocid(curURL.getDocid());
  webURL.setParentUrl(curURL.getURL());
  int newdocid = docIdServer.getDocId(webURL.getURL());
  if (newdocid > 0) {
    // ... the link has been seen before ...
  } else if (shouldVisit(page, webURL) && robotstxtServer.allows(webURL)) {
    // ... assign a doc id and schedule webURL for crawling ...
  }
}

Example source: yasserg/crawler4j

@Override
  public void objectToEntry(WebURL url, TupleOutput output) {
    output.writeString(url.getURL());
    output.writeInt(url.getDocid());
    output.writeInt(url.getParentDocid());
    output.writeString(url.getParentUrl());
    output.writeShort(url.getDepth());
    output.writeByte(url.getPriority());
    output.writeString(url.getAnchor());
  }
}
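
For completeness, the matching read side of this tuple binding rebuilds the WebURL by reading the fields back in the same order. A sketch using the com.sleepycat.bind.tuple API and the standard WebURL setters:

@Override
public WebURL entryToObject(TupleInput input) {
  WebURL webURL = new WebURL();
  webURL.setURL(input.readString());
  webURL.setDocid(input.readInt());
  webURL.setParentDocid(input.readInt());
  webURL.setParentUrl(input.readString());
  webURL.setDepth(input.readShort());
  webURL.setPriority(input.readByte());
  webURL.setAnchor(input.readString());
  return webURL;
}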

Example source: yasserg/crawler4j

// Excerpts from Parser.parse — two catch blocks, for CSS and for other content:
try {
  // ... build parseData from the fetched CSS ...
  page.setParseData(parseData);
} catch (Exception e) {
  logger.error("{}, while parsing css: {}", e.getMessage(), page.getWebURL().getURL());
  throw new ParseException();
}
try {
  // ... build parseData from the fetched content ...
  page.setParseData(parseData);
} catch (Exception e) {
  logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
  throw new ParseException(e);
}

Example source: stackoverflow.com

public boolean shouldVisit(Page page, WebURL url) {
  String href = url.getURL().toLowerCase();
  // prefixes that you want to crawl
  String[] allowedPrefixes = {"http://url1.com", "http://url2.com"};

  for (String allowedPrefix : allowedPrefixes) {
    if (href.startsWith(allowedPrefix)) {
      return true;
    }
  }

  return false;
}

Example source: biezhi/java-library-examples

@Override
public boolean shouldVisit(Page referringPage, WebURL url) {
  String href = url.getURL().toLowerCase();
  return !FILTERS.matcher(href).matches() && href.startsWith("http://www.ics.uci.edu/");
}
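
The shouldVisit/visit overrides shown above only take effect once a crawl is started. A minimal sketch of the usual crawler4j bootstrap (MyCrawler stands for any WebCrawler subclass like the ones above; the storage folder and thread count are placeholder values):

CrawlConfig config = new CrawlConfig();
config.setCrawlStorageFolder("/tmp/crawler4j"); // placeholder path
PageFetcher pageFetcher = new PageFetcher(config);
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
controller.addSeed("http://www.ics.uci.edu/");
controller.start(MyCrawler.class, 7); // 7 = number of crawler threads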
