01 - Getting All Links From a Specified Website
The requirement is to collect every URL inside a given website. This is harder than it sounds: you have to recurse into the site level by level and filter and extract links at each step, so hand-rolling it would take a long time (a rough sketch of what that would involve is shown below). Fortunately, Java has ready-made frameworks that let us implement this quickly.
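For context, here is a minimal sketch of the hand-rolled approach, using only the JDK (Java 11+). It is not part of the final solution: the `NaiveCrawler` class name and the regex-based link extraction are illustrative assumptions, and it skips HTML parsing, relative-link resolution, robots.txt, throttling, and most error handling, which is exactly what a framework takes care of.

```java
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class NaiveCrawler {

    // Crude href extraction; a real crawler needs an HTML parser, URL
    // normalization, relative-link resolution, robots.txt handling, throttling...
    private static final Pattern HREF = Pattern.compile("href=\"(https?://[^\"]+)\"");

    public static void main(String[] args) {
        String root = "https://blog.share888.top/"; // the site crawled later in this article
        HttpClient client = HttpClient.newHttpClient();
        Set<String> seen = new HashSet<>();
        Deque<String> queue = new ArrayDeque<>();
        queue.add(root);

        while (!queue.isEmpty()) {
            String url = queue.poll();
            if (!seen.add(url)) {
                continue; // already visited
            }
            System.out.println(url);
            try {
                HttpResponse<String> resp = client.send(
                        HttpRequest.newBuilder(URI.create(url)).build(),
                        HttpResponse.BodyHandlers.ofString());
                Matcher m = HREF.matcher(resp.body());
                while (m.find()) {
                    String link = m.group(1);
                    if (link.startsWith(root)) { // stay inside the target site
                        queue.add(link);
                    }
                }
            } catch (Exception e) {
                System.err.println("Failed to fetch " + url + ": " + e.getMessage());
            }
        }
    }
}
```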
crawler4j implementation
Dependency
```xml
<dependency>
    <groupId>edu.uci.ics</groupId>
    <artifactId>crawler4j</artifactId>
    <version>4.4.0</version>
</dependency>
```
Crawler code
```java
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

import java.util.regex.Pattern;

public class MyCrawler extends WebCrawler {

    // Skip static resources with these file extensions
    private static final Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpg"
            + "|png|mp3|mp4|zip|gz))$");

    // Decide which URLs should be visited: drop static resources
    // and stay within the target site
    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        return !FILTERS.matcher(href).matches()
                && href.startsWith("https://blog.share888.top/");
    }

    // Called once for every page that is visited; here we simply print its URL
    @Override
    public void visit(Page page) {
        String url = page.getWebURL().getURL();
        System.out.println(url);
    }
}
```
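The `visit()` method above only prints each URL. If you want the collected URLs back in the calling code instead, one option (a sketch, not part of the original implementation; the `CollectingCrawler` name is made up here) is to accumulate them per crawler instance and expose them through the `getMyLocalData()` hook that crawler4j's `WebCrawler` provides:

```java
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;

public class CollectingCrawler extends WebCrawler {

    private static final Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpg"
            + "|png|mp3|mp4|zip|gz))$");

    // crawler4j creates one crawler instance per thread, so a plain HashSet is fine here
    private final Set<String> collectedUrls = new HashSet<>();

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        return !FILTERS.matcher(href).matches()
                && href.startsWith("https://blog.share888.top/");
    }

    @Override
    public void visit(Page page) {
        collectedUrls.add(page.getWebURL().getURL());
    }

    // Returned value can be picked up by the controller via getCrawlersLocalData()
    @Override
    public Object getMyLocalData() {
        return collectedUrls;
    }
}
```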
Test code
```java
import ch.qos.logback.classic.Level;
import ch.qos.logback.classic.Logger;
import ch.qos.logback.classic.LoggerContext;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import org.junit.Test;
import org.slf4j.LoggerFactory;

import java.util.List;

// Named CrawlerTest so the class does not clash with the imported org.junit.Test annotation
public class CrawlerTest {

    static {
        // crawler4j is very chatty at DEBUG; raise all logback loggers to INFO
        LoggerContext loggerContext = (LoggerContext) LoggerFactory.getILoggerFactory();
        List<Logger> loggerList = loggerContext.getLoggerList();
        loggerList.forEach(logger -> logger.setLevel(Level.INFO));
    }

    @Test
    public void test() throws Exception {
        // Folder where crawler4j stores its intermediate data
        String crawlStorageFolder = "/data/crawl";
        // Number of crawler threads to start
        int numberOfCrawlers = 7;

        // Configure the crawl and its storage folder
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder(crawlStorageFolder);

        // PageFetcher downloads the pages
        PageFetcher pageFetcher = new PageFetcher(config);

        // RobotstxtConfig and RobotstxtServer handle robots.txt rules
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);

        // CrawlController starts and manages the crawler threads
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

        // Seed URL the crawl starts from
        controller.addSeed("https://blog.share888.top/");

        // Start the crawl (blocks until all crawlers finish)
        controller.start(MyCrawler.class, numberOfCrawlers);
    }
}
```
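Since `controller.start(...)` blocks until the crawl finishes, the per-crawler results can be aggregated afterwards via `CrawlController.getCrawlersLocalData()`. Assuming the `CollectingCrawler` sketch above, the end of the test method could look like this (it replaces the final `controller.start(...)` line, needs `java.util.HashSet` and `java.util.Set` imports, and involves an unchecked cast because `getCrawlersLocalData()` returns `List<Object>`):

```java
controller.start(CollectingCrawler.class, numberOfCrawlers);

// Gather the sets each crawler returned from getMyLocalData()
Set<String> allUrls = new HashSet<>();
for (Object localData : controller.getCrawlersLocalData()) {
    allUrls.addAll((Set<String>) localData);
}
allUrls.forEach(System.out::println);
```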