I am trying to extract meta-description content, from pages & display in search-results. But, its displaying it like :
content="Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus."
Whereas I only want:
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus.
Any guess, whats wrong in my code?
CODE :
$doc = new DOMDocument();
@$doc->loadHTMLFile($page_path);
$xpath = new DOMXPath($doc);
$body = $xpath->query('//meta[@name="description"]/@content');
$page_title = @$doc->getElementsByTagName('title')->item(0)->textContent;
$page_title = $page_title ? $page_title : $page_path;
$page_body = html2text($doc->saveXml($body->item(0)));// this is meta-description, which i want
Functions :
function html2text($html)
{
$text = $html;
static $search = array(
'@<script.+?</script>@usi', // Strip out javascript content
'@<style.+?</style>@usi', // Strip style content
'@<!--.+?-->@us', // Strip multi-line comments including CDATA
'@</?[a-z].*?\>@usi', // Strip out HTML tags
);
$text = preg_replace($search, ' ', $text);
/*
* normalize common entities
*/
$text = normalizeEntities($text);
/*
* decode other entities
*/
$text = html_entity_decode($text, ENT_QUOTES, 'utf-8');
/*
* normalize possibly repeated newlines, tabs, spaces to spaces
*/
$text = preg_replace('/\s+/u', ' ', $text);
$text = trim($text);
return $text;
}
/**
* Replace encoded and double encoded entities to equivalent unicode character
* @param string $text
* @return string - the same as $text but without encoded entries
* @access public
*/
function normalizeEntities($text)
{
static $find = array();
static $repl = array();
if (!count($find)) {
/*
* build $find and $replace from map one time
*/
$map = array(
array('\'', 'apos', 39, 'x27'), // Apostrophe
array('\'', '‘', 'lsquo', 8216, 'x2018'), // Open single quote
array('\'', '’', 'rsquo', 8217, 'x2019'), // Close single quote
array('"', '“', 'ldquo', 8220, 'x201C'), // Open double quotes
array('"', '”', 'rdquo', 8221, 'x201D'), // Close double quotes
array('\'', '‚', 'sbquo', 8218, 'x201A'), // Single low-9 quote
array('"', '„', 'bdquo', 8222, 'x201E'), // Double low-9 quote
array('\'', '′', 'prime', 8242, 'x2032'), // Prime/minutes/feet
array('"', '″', 'Prime', 8243, 'x2033'), // Double prime/seconds/inches
array(' ', 'nbsp', 160, 'xA0'), // Non-breaking space
array('-', '‐', 8208, 'x2010'), // Hyphen
array('-', '–', 'ndash', 8211, 150, 'x2013'), // En dash
array('--', '—', 'mdash', 8212, 151, 'x2014'), // Em dash
array(' ', ' ', 'ensp', 8194, 'x2002'), // En space
array(' ', ' ', 'emsp', 8195, 'x2003'), // Em space
array(' ', ' ', 'thinsp', 8201, 'x2009'), // Thin space
array('*', '•', 'bull', 8226, 'x2022'), // Bullet
array('*', '‣', 8227, 'x2023'), // Triangular bullet
array('...', '…', 'hellip', 8230, 'x2026'), // Horizontal ellipsis
array('°', 'deg', 176, 'xB0'), // Degree
array('€', 'euro', 8364, 'x20AC'), // Euro
array('¥', 'yen', 165, 'xA5'), // Yen
array('£', 'pound', 163, 'xA3'), // British Pound
array('©', 'copy', 169, 'xA9'), // Copyright Sign
array('®', 'reg', 174, 'xAE'), // Registered Sign
array('™', 'trade', 8482, 'x2122') // TM Sign
);
foreach ($map as $e) {
for ($i = 1; $i < count($e); ++$i) {
$code = $e[$i];
if (is_int($code)) {
// numeric entity
$regex = "/&(amp;)?#0*$code;/";
} elseif (preg_match('/^.$/u', $code)/* one unicode char*/) {
// single character
$regex = "/$code/u";
} elseif (preg_match('/^x([0-9A-F]{2}){1,2}$/i', $code)) {
// hex entity
$regex = "/&(amp;)?#x0*" . substr($code, 1) . ";/i";
} else {
// named entity
$regex = "/&(amp;)?$code;/";
}
$find[] = $regex;
$repl[] = $e[0];
}
}
}
return preg_replace($find, $repl, $text);
}