Property | Type | Description | |
---|---|---|---|
$articleContent | |||
$articleTitle | |||
$convertLinksToFootnotes | |||
$debug | no longer used, kept to avoid a BC break | ||
$dom | |||
$lightClean | preserves more content (experimental) | ||
$original_html | |||
$regexps | Defined up here so we don't instantiate them repeatedly in loops. | ||
$revertForcedParagraphElements | |||
$tidied | |||
$tidy_config | |||
$url | optional - URL from which the HTML was retrieved |
Property | Type | Description | |
---|---|---|---|
$body | |||
$bodyCache | Cache the body HTML in case we need to re-use it later | ||
$domainRegExp | article domain regexp for calibration | ||
$flags | Initial value `1 \| 2 \| 4`: start with all processing flags set. | ||
$html | |||
$logger | |||
$parser | |||
$post_filters | output HTML filters | ||
$pre_filters | raw HTML filters | ||
$success | indicates whether we were able to extract or not | ||
$useTidy |
Method | Description | |
---|---|---|
__construct ( $html, $url = null, $parser = 'libxml', $use_tidy = true ) | Create instance of Readability. | |
addFlag ( integer $flag ) | Add a flag. | |
addFootnotes ( DOMElement $articleContent ) | For easier reading, convert this document to have footnotes at the bottom rather than inline links. | |
addPostFilter ( $filter, $replacer = '' ) | Add post filter for raw output HTML processing. | |
addPreFilter ( $filter, $replacer = '' ) | Add pre filter for raw input HTML processing. | |
clean ( DOMElement $e, string $tag ) | Clean a node of all elements of type "tag". | |
cleanConditionally ( DOMElement $e, string $tag ) | Clean an element of all tags of type "tag" if they look fishy. | |
cleanHeaders ( DOMElement $e ) | Clean out spurious headers from an Element. Checks things like classnames and link density. | |
cleanStyles ( DOMElement $e ) | Remove the style attribute from $e and every element under it. | |
flagIsActive ( integer $flag ) : boolean | Check if the given flag is active. | |
getCommaCount ( string $text ) : integer | Get the number of commas in a given text. | |
getContent ( ) : DOMElement | Get article content element. | |
getInnerText ( DOMElement $e, boolean $normalizeSpaces = true, boolean $flattenLines = false ) : string | Get the inner text of a node. | |
getLinkDensity ( DOMElement $e, string $excludeExternal = false ) : integer | Get the density of links as a percentage of the content. This is the amount of text that is inside a link divided by the total text in the node. | |
getTitle ( ) : DOMElement | Get article title element. | |
getWeight ( DOMElement $e ) : integer | Get an element's relative weight. | |
getWordCount ( string $text ) : integer | Get the number of words in a given text, assuming words are separated by a space. | |
init ( ) : boolean | Runs readability. | |
killBreaks ( DOMElement $node ) | Remove extraneous break tags from a node. | |
postProcessContent ( DOMElement $articleContent ) | Run any post-process modifications to article content as necessary. | |
prepArticle ( DOMElement $articleContent ) | Prepare the article node for display. Clean out any inline styles, iframes, forms, strip extraneous tags, etc. | |
removeFlag ( integer $flag ) | Remove a flag. | |
setLogger ( Psr\Log\LoggerInterface $logger ) |
Method | Description | |
---|---|---|
dbg ( $msg ) | Debug. | |
dump_dbg ( ) | Dump debug info. | |
getArticleTitle ( ) : DOMElement | Get the article title as an H1. | |
grabArticle ( DOMElement $page = null ) : DOMElement | boolean | Using a variety of metrics (content score, classname, element types), find the content that is most likely to be the stuff a user wants to read. Then return it wrapped up in a div. | |
initializeNode ( DOMElement $node ) | Initialize a node with the readability object. Also checks the className/id for special names to add to its score. | |
prepDocument ( ) | Prepare the HTML document for readability to scrape it. | |
reinitBody ( ) | Will recreate previously deleted body property. | |
weightAttribute ( DOMElement $element, string $attribute ) : integer | Get an element weight by attribute. |
Method | Description | |
---|---|---|
loadHtml ( ) | Load HTML in a DOMDocument. |
public __construct ( $html, $url = null, $parser = 'libxml', $use_tidy = true ) |
public addFootnotes ( DOMElement $articleContent ) | ||
$articleContent | DOMElement |
public addPostFilter ( $filter, $replacer = '' ) |
public addPreFilter ( $filter, $replacer = '' ) |
public cleanConditionally ( DOMElement $e, string $tag ) | ||
$e | DOMElement | |
$tag | string |
public cleanHeaders ( DOMElement $e ) | ||
$e | DOMElement |
public cleanStyles ( DOMElement $e ) | ||
$e | DOMElement |
public flagIsActive ( integer $flag ) : boolean | ||
$flag | integer | |
return | boolean |
protected getArticleTitle ( ) : DOMElement | ||
return | DOMElement |
public getCommaCount ( string $text ) : integer | ||
$text | string | |
return | integer |
public getContent ( ) : DOMElement | ||
return | DOMElement |
public getLinkDensity ( DOMElement $e, string $excludeExternal = false ) : integer | ||
$e | DOMElement | |
$excludeExternal | string | |
return | integer |
public getTitle ( ) : DOMElement | ||
return | DOMElement |
public getWordCount ( string $text ) : integer | ||
$text | string | |
return | integer |
protected grabArticle ( DOMElement $page = null ) : DOMElement | boolean | ||
$page | DOMElement | |
return | DOMElement | boolean |
protected initializeNode ( DOMElement $node ) | ||
$node | DOMElement |
public killBreaks ( DOMElement $node ) | ||
$node | DOMElement |
public postProcessContent ( DOMElement $articleContent ) | ||
$articleContent | DOMElement |
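Prepare the article node for display. Clean out any inline styles, iframes, forms, strip extraneous tags, etc.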
public prepArticle ( DOMElement $articleContent ) | ||
$articleContent | DOMElement |
protected prepDocument ( ) |
protected reinitBody ( ) |
public setLogger ( Psr\Log\LoggerInterface $logger ) | ||
$logger | Psr\Log\LoggerInterface |
protected weightAttribute ( DOMElement $element, string $attribute ) : integer | ||
$element | DOMElement | |
$attribute | string | |
return | integer |
protected $bodyCache |
public $regexps |