基于Abot2的强大C#爬虫框架,能够渲染js、web assembly。
AbotEdge基于Abot2,因此继承了其强大的可扩展性,和极其方便的使用方法。如果只需要基础功能,您可以通过监听事件轻松使用它。而如果您想要自定义,我们为所有的类提供了接口,您可以自行实现它们中的一个或者多个以自定义它们的行为。
在这儿开放源码
非常高速
易扩展
多重测试 (极高的稳定性)(你们可能无法想象,原作者给Abot2写了300多个单元测试。。这一点AbotEdge当然很好的继承了)
极其轻量 (not over engineered)
依赖项少 (不需要数据库, 不需要额外安装服务, etc...如果你不使用js和wasm渲染的话)
退出自动保存状态,下次使用您可以自主选择是否继续上次的工作
通过config轻松定义行为
git clone ssh://git@59.175.109.42:23900/Alice/abot_edge.git
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Threading;
using System.Threading.Tasks;
using System.Xml.Serialization;
using Abot2.Core;
using Abot2.Crawler;
using Abot2.Poco;
using AngleSharp;
using AngleSharp.Dom;
using AngleSharp.Io;
using AngleSharp.Text;
using Newtonsoft.Json;
using Serilog;
using Serilog.Formatting.Json;
namespace Abot2.Demo
{
    /// <summary>
    /// Demo console app: crawls a JS-rendered SPA site with AbotEdge and prints
    /// the URLs of all pages whose text contains a given phrase.
    /// </summary>
    public class Program
    {
        // URLs of crawled pages whose text contained the search phrase.
        static readonly List<Uri> uris = new List<Uri>();
        // Kept to illustrate how a custom PageRequester is constructed;
        // not otherwise used by this demo.
        static PageRequester requester;

        public static async Task Main(string[] args)
        {
            Log.Logger = new LoggerConfiguration()
                .MinimumLevel.Information()
                .Enrich.WithThreadId()
                .WriteTo.Console(outputTemplate: Constants.LogFormatTemplate)
                .CreateLogger();

            Log.Information("Demo starting up!");
            await DemoSimpleCrawler();
            Log.Information("Demo done!");

            // Print every matching page found during the crawl.
            foreach (var item in uris)
            {
                Console.WriteLine(item.AbsoluteUri);
            }
            Console.ReadKey();
        }

        /// <summary>
        /// Configures and runs a polite crawl against https://limfx.xyz.
        /// Pressing any key stops the crawl; cancellation triggers the
        /// automatic state save so the crawl can be resumed later.
        /// </summary>
        private static async Task DemoSimpleCrawler()
        {
            var config = new CrawlConfiguration
            {
                MaxPagesToCrawl = 1000,
                MinCrawlDelayPerDomainMilliSeconds = 1000,
                MaxCrawlDepth = 10,
                MaxConcurrentThreads = 100,
                RangeFirst = false, // false = depth-first crawl
                RenderJsAndAsm = true // render JS/WASM (requires Chrome + chromedriver, see tips below)
            };
            requester = new PageRequester(config, new WebContentExtractor());

            var crawler = new PoliteWebCrawler(config);
            crawler.PageCrawlCompleted += Crawler_PageCrawlLimfxCompleted;

            var token = new CancellationTokenSource();
            //var crawlresult = crawler.ResumeAsync(); // uncomment to resume a previously saved crawl
            // limfx.xyz is a Vue SPA; ordinary crawlers cannot scrape it —
            // only a JS-rendering crawler can fetch its content.
            var crawlTask = crawler.CrawlAsync(new Uri("https://limfx.xyz"), token);

            // Wait for a key press: if the crawl already finished, report the
            // page count; otherwise cancel, which saves state automatically.
            Console.ReadKey();
            if (crawlTask.IsCompleted)
            {
                Log.Information($"crawled {crawlTask.Result.CrawlContext.CrawledCount} pages");
            }
            token.Cancel(); // stop crawling; state is saved automatically
            crawler.Dispose();
        }

        /// <summary>
        /// Page-crawled handler: records the page URL when its text contains
        /// the search phrase. Pages that failed to download are skipped.
        /// </summary>
        private static void Crawler_PageCrawlLimfxCompleted(object sender, PageCrawlCompletedArgs e)
        {
            // The request may have failed entirely — skip pages without a response.
            if (e.CrawledPage?.HttpResponseMessage == null)
            {
                return;
            }

            // Content.Text can be null when the body was not downloaded.
            var text = e.CrawledPage.Content?.Text;
            if (string.IsNullOrEmpty(text))
            {
                return;
            }

            if (text.Contains("脱碳甲醛"))
            {
                uris.Add(e.CrawledPage.Uri);
            }
        }
    }
}
tips:
若使用js、asm渲染,电脑应当安装chrome浏览器,并去chrome driver下载正确版本的driver,确保C:\Program Files\路径下存在chromedriver.exe。现在,只需确保项目引用了nuget包即可
Abot 被设计为尽可能的可扩展。这允许您轻松更改其工作方式,以满足您的需求。
CrawlDecision Callbacks/Delegates
在WebCrawler/PoliteWebCrawler类中,提供了一系列的委托,允许您通过它们快速定义哪些网站应当被忽略,哪些网站应当不被下载etc..
var crawler = new PoliteWebCrawler();
crawler.ShouldCrawlPageDecisionMaker((pageToCrawl, crawlContext) =>
{
var decision = new CrawlDecision{ Allow = true };
if(pageToCrawl.Uri.Authority == "google.com")
return new CrawlDecision{ Allow = false, Reason = "Dont want to crawl google pages" };
return decision;
});
crawler.ShouldDownloadPageContentDecisionMaker((crawledPage, crawlContext) =>
{
var decision = new CrawlDecision{ Allow = true };
if (!crawledPage.Uri.AbsoluteUri.Contains(".com"))
return new CrawlDecision { Allow = false, Reason = "Only download raw page content for .com tlds" };
return decision;
});
crawler.ShouldCrawlPageLinksDecisionMaker((crawledPage, crawlContext) =>
{
var decision = new CrawlDecision{ Allow = true };
if (crawledPage.Content.Bytes.Length < 100)
return new CrawlDecision { Allow = false, Reason = "Just crawl links in pages that have at least 100 bytes" };
return decision;
});
自定义实现
PoliteWebCrawler是协调爬网的主体。它的工作是协调所有实用程序类来爬取网站点。PoliteWebCrawler 可以通过其构造函数接受其所有依赖项的备用实现。
var crawler = new PoliteWebCrawler(
new CrawlConfiguration(),
new YourCrawlDecisionMaker(),
new YourThreadMgr(),
new YourScheduler(),
new YourPageRequester(),
new YourHyperLinkParser(),
new YourMemoryManager(),
new YourDomainRateLimiter(),
new YourRobotsDotTextFinder());
向任何参数传递 null 都将使用默认值。下面的示例将使用客户 IPageRequester 和 IHyperLinkParser 的自定义实现,但对其它类将使用默认实现。
var crawler = new PoliteWebCrawler(
null,
null,
null,
null,
new YourPageRequester(),
new YourHyperLinkParser(),
null,
null,
null);
以下是 PoliteWebCrawler 赖以执行实际工作的每个接口的说明。
ICrawlDecisionMaker
上文描述的 callback/delegate 模式非常适合快速编辑,但是如果您需要进一步自定义决策功能,则建议您自行实现 ICrawlDecisionMaker 接口的实例
CrawlDecisionMaker.cs 是AbotEdge中decisionmacker的默认实现
/// <summary>
/// Determines what pages should be crawled, whether the raw content should be downloaded and if the links on a page should be crawled
/// </summary>
public interface ICrawlDecisionMaker
{
/// <summary>
/// Decides whether the page should be crawled
/// </summary>
CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext);
/// <summary>
/// Decides whether the page's links should be crawled
/// </summary>
CrawlDecision ShouldCrawlPageLinks(CrawledPage crawledPage, CrawlContext crawlContext);
/// <summary>
/// Decides whether the page's content should be downloaded
/// </summary>
CrawlDecision ShouldDownloadPageContent(CrawledPage crawledPage, CrawlContext crawlContext);
}
IThreadManager
IThreadManager 接口用于管理多线程
TaskThreadManager.cs 是该接口的默认实现.
/// <summary>
/// Handles the multithreading implementation details
/// </summary>
public interface IThreadManager : IDisposable
{
/// <summary>
/// Max number of threads to use.
/// </summary>
int MaxThreads { get; }
/// <summary>
/// Will perform the action asynchronously on a separate thread
/// </summary>
/// <param name="action">The action to perform</param>
void DoWork(Action action);
/// <summary>
/// Whether there are running threads
/// </summary>
bool HasRunningThreads();
/// <summary>
/// Abort all running threads
/// </summary>
void AbortAll();
}
IScheduler
IScheduler 接口用于管理需要爬取的页面,它接收爬虫新获得的链接并提供给爬虫需要爬取的新页面。
Scheduler.cs 是该接口的默认实现。
/// <summary>
/// Handles managing the priority of what pages need to be crawled
/// </summary>
public interface IScheduler
{
/// <summary>
/// Count of remaining items that are currently scheduled
/// </summary>
int Count { get; }
/// <summary>
/// Schedules the param to be crawled
/// </summary>
void Add(PageToCrawl page);
/// <summary>
/// Schedules the param to be crawled
/// </summary>
void Add(IEnumerable<PageToCrawl> pages);
/// <summary>
/// Gets the next page to crawl
/// </summary>
PageToCrawl GetNext();
/// <summary>
/// Clear all currently scheduled pages
/// </summary>
void Clear();
Task SaveAsync();
void Resume();
}
IPageRequester
IPageRequester 接口用来发送 HTTP 请求
PageRequester.cs 是它的默认实现。
public interface IPageRequester
{
/// <summary>
/// Make an http web request to the url and download its content
/// </summary>
CrawledPage MakeRequest(Uri uri);
/// <summary>
/// Make an http web request to the url and download its content based on the param func decision
/// </summary>
CrawledPage MakeRequest(Uri uri, Func<CrawledPage, CrawlDecision> shouldDownloadContent);
}
IHyperLinkParser
IHyperLinkParser 接口用于从html中提取链接
HapHyperlinkParser.cs 是该接口的默认实现. 它使用知名的c#类库 Html Agility Pack. 我们也提供了使用AngleSharp的另一实现 [AngleSharpHyperLinkParser.cs]。AngleSharp 使用类似jQuery的css选择器,但是是全c#的。
/// <summary>
/// Handles parsing hyperlinks out of the raw html
/// </summary>
public interface IHyperLinkParser
{
/// <summary>
/// Parses html to extract hyperlinks, converts each into an absolute url
/// </summary>
IEnumerable<Uri> GetLinks(CrawledPage crawledPage);
}
IMemoryManager
IMemoryManager 用于内存监控。此功能在实验期。若无法证实其可靠性未来可能会移除。
MemoryManager.cs 是该接口的默认实现。
/// <summary>
/// Handles memory monitoring/usage
/// </summary>
public interface IMemoryManager : IMemoryMonitor, IDisposable
{
/// <summary>
/// Whether the current process that is hosting this instance is allocated/using above the param value of memory in mb
/// </summary>
bool IsCurrentUsageAbove(int sizeInMb);
/// <summary>
/// Whether there is at least the param value of available memory in mb
/// </summary>
bool IsSpaceAvailable(int sizeInMb);
}
IDomainRateLimiter
IDomainRateLimiter 管理域名访问限制。防止访问过于频繁ip被封。
DomainRateLimiter.cs 是它的默认实现
/// <summary>
/// Rate limits or throttles on a per domain basis
/// </summary>
public interface IDomainRateLimiter
{
/// <summary>
/// If the domain of the param has been flagged for rate limiting, it will be rate limited according to the configured minimum crawl delay
/// </summary>
void RateLimit(Uri uri);
/// <summary>
/// Add a domain entry so that domain may be rate limited according to the param minimum crawl delay
/// </summary>
void AddDomain(Uri uri, long minCrawlDelayInMillisecs);
}
IRobotsDotTextFinder
IRobotsDotTextFinder 负责检索每个域的 robots.txt 文件。
RobotsDotTextFinder.cs 是它的默认实现
/// <summary>
/// Finds and builds the robots.txt file abstraction
/// </summary>
public interface IRobotsDotTextFinder
{
/// <summary>
/// Finds the robots.txt file using the rootUri.
/// </summary>
IRobotsDotText Find(Uri rootUri);
}