Property | Type | Description | |
---|---|---|---|
$baseUrl | string | The base URL from which the crawler begins crawling | |
$links | array | Array of links (and related data) found by the crawler | |
$maxDepth | integer | The max depth the crawler will crawl |
Method | Description | |
---|---|---|
__construct ( string $baseUrl, integer $maxDepth = 3 ) | Constructor | |
getLinks ( ) : array | Get links (and related data) found by the crawler | |
traverse ( string $url = null ) | Initiate the crawl |
Method | Description | |
---|---|---|
checkIfCrawlable ( string $uri ) : boolean | Is a given URL crawlable? | |
checkIfExternal ( string $url ) : boolean | Is URL external? | |
extractLinksInfo ( $crawler, string $url ) : array | Extract link information from a URL | |
extractTitleInfo ( $crawler, string $url ) | Extract title information from a URL | |
getPathFromUrl ( type $url ) : type | Extract the relative path from a URL string | |
getScrapClient ( ) : |
Create and configure the Goutte client used for scraping | |
normalizeLink ( $uri ) : string | Normalize link (remove hash, etc.) | |
traverseChildren ( array $childLinks, integer $depth ) | Crawl child links | |
traverseSingle ( string $url, integer $depth ) | Crawl single URL |
public __construct ( string $baseUrl, integer $maxDepth = 3 ) | ||
$baseUrl | string | |
$maxDepth | integer |
protected checkIfCrawlable ( string $uri ) : boolean | ||
$uri | string | |
return | boolean |
protected checkIfExternal ( string $url ) : boolean | ||
$url | string | An absolute URL (with scheme) |
return | boolean |
protected extractLinksInfo ( $crawler, string $url ) : array | ||
$crawler | ||
$url | string | |
return | array |
protected extractTitleInfo ( $crawler, string $url ) | ||
$crawler | ||
$url | string |
protected getPathFromUrl ( type $url ) : type | ||
$url | type | |
return | type |
protected getScrapClient ( ) : |
||
return |
protected normalizeLink ( $uri ) : string | ||
return | string |
protected traverseChildren ( array $childLinks, integer $depth ) | ||
$childLinks | array | |
$depth | integer |
protected traverseSingle ( string $url, integer $depth ) | ||
$url | string | |
$depth | integer |
protected string $baseUrl | ||
return | string |
protected array $links | ||
return | array |
protected int $maxDepth | ||
return | integer |