Monday, May 18, 2009

UPDATE 20090516 : mingpao.php

Mingpao News has changed its page layout, so the parser had to be rewritten.

Here is the code:

Code:

<?php

/**
 * Create the shared cURL handle and store it in the global $ch.
 *
 * The handle is configured to return the response body as a string without
 * headers, not to follow redirects, to skip TLS peer verification, and to
 * send a desktop Firefox User-Agent string.
 *
 * @return void
 */
function InitCurl()
{
    global $ch;

    $ch = curl_init();

    // Applied one by one, in this order, exactly like individual curl_setopt() calls.
    $settings = array(
        CURLOPT_HEADER         => 0,
        CURLOPT_FOLLOWLOCATION => 0,
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_USERAGENT      => "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.3) Gecko/20070309 Firefox/2.0.0.3",
    );

    foreach ($settings as $option => $value) {
        curl_setopt($ch, $option, $value);
    }
}

/**
 * Fetch a URL through the shared cURL handle held in the global $ch.
 *
 * @param string $url Address to request.
 *
 * @return mixed Whatever curl_exec() returns for the configured handle.
 */
function CurlGetContent($url)
{
    global $ch;

    // Point the shared handle at the requested address, then run the transfer.
    curl_setopt($ch, CURLOPT_URL, $url);
    $response = curl_exec($ch);

    return $response;
}

/**
 * Print one news article as a two-column HTML table and flush it to the client.
 *
 * The left column lists the article pictures (each followed by its caption),
 * the right column holds the headline and the article body. Picture URLs are
 * built from the globals $base_url and $today.
 *
 * @param array $news Article with the keys 'Title', 'Body' and 'PicSet'.
 *                    'PicSet' is rendered only when it is an array of
 *                    entries carrying 'Pic' and 'PicText'.
 *
 * @return void
 */
function DisplayNewsPage($news)
{
    global $base_url;
    global $today;

    $imageBase = $base_url.$today.'/';

    print '<table><tr><td width=200 VALIGN="top">';

    // 'PicSet' may be absent or a non-array placeholder when there are no pictures.
    if (isset($news['PicSet']) && is_array($news['PicSet'])) {
        foreach ($news['PicSet'] as $picture) {
            print "<img src=\"".$imageBase.$picture['Pic']."\" onmouseover=\"popImg(true, this);\" onmouseout=\"popImg(false);\" width=180><BR>\r\n";
            print $picture['PicText']."<BR><BR>\r\n";
        }
    }

    print '</td><td VALIGN="top">';
    print '<H1>'.$news['Title'].'</H1><BR><BR>';
    print $news['Body'];
    print '</td></tr>';
    print '</table>'."\r\n";

    // ob_flush() raises a notice when output buffering is off, so only call
    // it when there is actually a buffer to flush.
    if (ob_get_level() > 0) {
        ob_flush();
    }
    flush();
}

/**
 * Fetch the photo view of an article page and extract the picture and caption.
 *
 * The page requested is $base_url.$today.'/'.$url.'?Mode=1'.
 *
 * @param string $url Article path relative to the dated base URL.
 *
 * @return array|string Array with 'Pic' (image src) and 'PicText' (caption,
 *                      possibly empty), or "" when no main photo is found.
 */
function GetPic($url)
{
    global $base_url;
    global $today;

    $page = CurlGetContent($base_url.$today.'/'.$url.'?Mode=1');

    // Only the first main photo on the page is used.
    if (!preg_match('|mainphotolink"><img src="([^"]+)"|', $page, $photo)) {
        return "";
    }

    // The caption is optional. The pattern is lazy without the U modifier:
    // combining U with ".*?" flips the quantifier back to greedy, which
    // would swallow several cells that share one line.
    $caption = "";
    if (preg_match('|<td align="center" class="caption">(.*?)</td>|', $page, $captionMatch)) {
        $caption = $captionMatch[1];
    }

    return array(
        'Pic' => $photo[1],
        'PicText' => $caption,
    );
}

/**
 * Download one article page and extract its title, body and pictures.
 *
 * @param string $url Article path relative to the dated base URL
 *                    ($base_url.$today.'/').
 *
 * @return array Array with the keys:
 *               - 'Title':  text of the first <h1>, with <br> replaced by a space;
 *               - 'Body':   concatenated contents of every "newscontent" <div>;
 *               - 'PicSet': array of GetPic() results keyed by photo link, or ""
 *                           when the article has no usable picture.
 */
function GetNewsContent($url)
{
    global $base_url;
    global $today;

    $page = CurlGetContent($base_url.$today.'/'.$url);

    // Lazy match without the U modifier: U together with ".*?" would make
    // the capture greedy and run on to the last </h1> on the line.
    $title = "";
    if (preg_match('|<h1>(.*?)</h1>|', $page, $heading)) {
        $title = str_replace('<br>', ' ', $heading[1]);
    }

    $body = "";
    if (preg_match_all('|<div class="[^"]*" id="newscontent[\d]*">(.*?)</div>|is', $page, $sections, PREG_SET_ORDER)) {
        foreach ($sections as $section) {
            $body .= $section[1];
        }
    }

    // Collected in a real array: indexing into an empty string with a
    // non-numeric key no longer converts it to an array on PHP >= 7.1.
    $pictures = array();
    if (preg_match_all("|<a href=\"([^\"]+)\?Mode=1\">|U", $page, $photoLinks, PREG_SET_ORDER)) {
        foreach ($photoLinks as $photoLink) {
            $picture = GetPic($photoLink[1]);
            if (is_array($picture)) {
                $pictures[$photoLink[1]] = $picture;
            }
        }
    }

    return array(
        'Title' => $title,
        'Body' => $body,
        // Keep the historical "" placeholder when nothing was found.
        'PicSet' => $pictures ? $pictures : "",
    );
}

/**
 * Run ProcessPage() on every entry of the global $title_pages_set whose value
 * does not contain the substring "main".
 *
 * Prints 'ERR' and does nothing else when $title_pages_set is not an array.
 *
 * NOTE(review): neither ProcessPage() nor $title_pages_set is defined in the
 * visible part of this file - confirm they exist before calling this.
 *
 * @return void
 */
function ProcessAllTitles()
{
    global $title_pages_set;

    if (is_array($title_pages_set)) {
        foreach ($title_pages_set as $key => $val) {
            // Entries that mention "main" are skipped.
            if (strstr($val, "main")) {
                continue;
            }
            ProcessPage($key, $val);
        }

        return;
    }

    echo 'ERR';
}

/**
 * Extract the section links from the front page's "sublink[0]" script string.
 *
 * @param string $ming HTML of the front page.
 *
 * @return array|string List of array('Title' => ..., 'Link' => ...) for every
 *                      anchor whose href contains "index.htm". On failure a
 *                      string is returned instead: "1" when no sublink[0]
 *                      assignment is found, "2" when it contains no anchors,
 *                      "" when no anchor points at an index.htm page.
 */
function GetTitle($ming)
{
    // Greedy on purpose: the original "(.*?)" under the U modifier was
    // effectively greedy too, i.e. it ran to the last quote on the line.
    if (!preg_match("|sublink\[0\] = '(.*)'|", $ming, $menu)) {
        return "1";
    }

    if (!preg_match_all('|<a href="([^"]+)"[^>]*>([^<]+)</a>|', $menu[1], $anchors, PREG_SET_ORDER)) {
        return "2";
    }

    // Must start as an array: "$string[] = ..." is a fatal error on PHP >= 7.1.
    $titles = array();
    foreach ($anchors as $anchor) {
        // Literal substring test; a regex dot would also accept e.g. "indexXhtm".
        if (strpos($anchor[1], 'index.htm') === false) {
            continue;
        }
        $titles[] = array(
            'Title' => $anchor[2],
            'Link' => $anchor[1],
        );
    }

    // Callers rely on is_array() to detect "nothing found", so keep returning "".
    return $titles ? $titles : "";
}

/**
 * Download a section index page and collect its headline links.
 *
 * Every <h1> or <li> element is inspected; the first anchor inside it
 * supplies the headline.
 *
 * @param string $url Section path relative to the dated base URL
 *                    ($base_url.$today.'/').
 *
 * @return array|string|null Array keyed by link, each entry being
 *                           array('Title' => ..., 'Link' => ...) with <br>
 *                           in the title replaced by four &nbsp;. Returns
 *                           null when the page has no <h1>/<li> elements and
 *                           "" when none of them contains a link.
 */
function GetHeadlines($url)
{
    global $base_url;
    global $today;

    $page = CurlGetContent($base_url.$today.'/'.$url);

    // Lazy without the U modifier: with U, ".*?" turns greedy and several
    // items on the same line collapse into a single match.
    if (!preg_match_all('#<(h1|li)>(.*?)</\1>#', $page, $items, PREG_SET_ORDER)) {
        return;
    }

    // Must be an array: writing $string[$key] = array(...) fails on PHP >= 7.1.
    $heads = array();
    foreach ($items as $item) {
        if (!preg_match('|<a href="([^"]+)">(.*?)</a>|', $item[2], $anchor)) {
            continue;
        }
        $link = $anchor[1];
        $heads[$link] = array(
            'Title' => str_replace('<br>', '&nbsp;&nbsp;&nbsp;&nbsp;', trim($anchor[2])),
            'Link' => $link,
        );
    }

    // Callers test is_array(), so keep the historical "" for "no headlines".
    return $heads ? $heads : "";
}

/**
 * Print the selection form for one news section.
 *
 * The form posts back to this script with ShowNews=1 and one checkbox per
 * headline; each headline is also a direct link that opens the article in a
 * new window.
 *
 * @param array  $heads Headlines, each array('Title' => ..., 'Link' => ...).
 *                      'Title' is printed as-is (it may contain markup).
 * @param string $name  Section label shown in the left column, printed as-is.
 *
 * @return void
 */
function DisplayHeader($heads, $name)
{
    // PHP_SELF is client-influenced (trailing path info), so it must be
    // escaped before it is placed in an attribute. A single-byte charset is
    // used so escaping never fails on non-UTF-8 bytes.
    $self = htmlspecialchars($_SERVER['PHP_SELF'], ENT_QUOTES, 'ISO-8859-1');

    print '<FORM action="'.$self.'" method="POST" target="_blank">';
    print '<table><tr>';

    print '<td width=100 valign=top>'.$name."<BR>\n";

    print '<INPUT type=SUBMIT value=Submit><INPUT type=RESET>';
    print '<input type=BUTTON onclick="selectAll(this.form);" value=SelectAll>';

    print '<INPUT type=HIDDEN name=ShowNews value=1>';
    print "<BR>\r\n</td><td valign=top>";

    foreach ($heads as $head) {
        // Attribute context and query-string context need different escaping.
        $linkAttr = htmlspecialchars($head['Link'], ENT_QUOTES, 'ISO-8859-1');
        $linkQuery = rawurlencode($head['Link']);

        print '<INPUT type=CHECKBOX name="'.$linkAttr.'" value="'.$linkAttr.'">';
        print '<a href="'.$self.'?ShowNews=1&amp;'.$linkQuery.'='.$linkQuery.'" target="_blank">'.$head['Title']."</a><BR>\n";
    }

    print '</td></tr></table>';
    print '</FORM>'."\r\n";
}

/**
 * Emit the inline <script> block used by the generated pages.
 *
 * The script defines three browser-side helpers:
 *  - get(eid):            shorthand for document.getElementById;
 *  - popImg(open, iref):  shows (open = true) or hides a floating copy of the
 *                         thumbnail iref to the right of it;
 *  - selectAll(formObj):  ticks every checkbox of the given form.
 *
 * @return void
 */
function PrintJavaScript()
{
?>

<script type='text/javascript'>
function get(eid)
{
    return document.getElementById(eid);
}
function popImg(open, iref)
{
    var pop = get('popImg');
    if (!open)
    {
        // The popup only exists after the first mouseover; hiding it before
        // that must not throw.
        if (pop)
        {
            pop.style.display = 'none';
        }
        return;
    }
    var top = (iref.offsetParent.offsetParent.offsetTop + iref.offsetTop) + 'px';
    var curleft = 0;
    var obj = iref;
    do {
        curleft += obj.offsetLeft;
    } while (obj = obj.offsetParent);
    var left = (curleft + iref.offsetWidth) + 'px';
    if (null == pop)
    {
        pop = document.createElement('DIV');
        pop.id = 'popImg';
        pop.style.position = 'absolute';
        document.body.appendChild(pop);
    }
    pop.innerHTML = '<img src="' + iref.src + '" />';
    pop.style.top = top;
    pop.style.left = left;
    pop.style.display = 'block';
}
function selectAll(formObj)
{
    for (var i = 0; i < formObj.length; i++)
    {
        var fldObj = formObj.elements[i];
        if (fldObj.type == 'checkbox')
        {
            fldObj.checked = true;
        }
    }
}

</script>

<?php
}

/**
 * Read the current issue identifier from the front page's <base href> tag.
 *
 * @param string $ming HTML of the front page.
 *
 * @return string The digits following "http://news.mingpao.com/" in the
 *                <base href="..."> tag, or "" when no such tag is present.
 */
function GetToday($ming)
{
    // Dots are escaped so that only the literal host name is accepted.
    if (!preg_match('|<base href="http://news\.mingpao\.com/(\d+)/">|', $ming, $match)) {
        return "";
    }

    return $match[1];
}

/**
 * Initialise every global the rest of the script relies on.
 *
 * Sets up the shared cURL handle, the HTML page header and footer
 * ($HTTP_HEADER / $HTTP_FOOTER), the site URLs ($base_url / $main_url) and
 * $today, which is read from the freshly downloaded front page.
 *
 * @return void
 */
function PrepareGlobals()
{
    global $HTTP_HEADER, $HTTP_FOOTER;
    global $today, $base_url, $main_url;

    InitCurl();

    // Page header: document head, inline stylesheet, opening <body>.
    $headerParts = array(
        '<html><head><meta http-equiv="Content-Type" content="text/html; charset=Big5-HKSCS"><title>My MingPaoNews</title>',
        '<style type="text/css"> ',
        'a:link { color: #0000FF; text-decoration: none; } ',
        'a:active { color: #000088; text-decoration: underline; } ',
        'a:visited { color: #000088; text-decoration: none; } ',
        'a:hover { color: #0055FF; text-decoration: underline; } ',
        'table {width: 100% ; border-width:1px; border-collapse: collapse; border-color:#003333; border-style:dashed} ',
        'td {padding: 3px;} ',
        '</style>',
        '</head><body>'."\r\n",
    );
    $HTTP_HEADER = implode('', $headerParts);

    $HTTP_FOOTER = "\r\n".'</body></html>'."\r\n";

    $base_url = "http://news.mingpao.com/";
    $main_url = $base_url.'index.htm';

    // The front page announces the current issue through its <base href>.
    $frontPage = CurlGetContent($main_url);
    $today = GetToday($frontPage);
}

/**
 * Render the articles selected in the request.
 *
 * Every request parameter other than "ShowNews" is treated as an article
 * path; each article is downloaded and printed between the shared page
 * header and footer.
 *
 * @return void
 */
function DoShowNews()
{
    global $HTTP_HEADER;
    global $HTTP_FOOTER;

    print $HTTP_HEADER;
    PrintJavaScript();

    foreach ($_REQUEST as $field => $path) {
        // Strict comparison: a numeric key must not compare equal to the
        // marker under loose typing.
        if ((string) $field === "ShowNews") {
            continue;
        }

        // Request values are untrusted: array-valued parameters (a[]=...)
        // and empty values cannot name an article, so skip them.
        if (!is_string($path) || $path === '') {
            continue;
        }

        $news = GetNewsContent($path);
        if (is_array($news)) {
            DisplayNewsPage($news);
        }
    }

    print $HTTP_FOOTER;
}

/**
 * Render the index page: one headline selection form per news section.
 *
 * Downloads the front page, extracts the section list and prints the
 * headlines of each section. Prints "ERR Titles" in place of the forms when
 * the section list cannot be extracted.
 *
 * @return void
 */
function DoIndex()
{
    global $HTTP_HEADER, $HTTP_FOOTER, $main_url;

    print $HTTP_HEADER;

    PrintJavaScript();

    $frontPage = CurlGetContent($main_url);
    $sections = GetTitle($frontPage);

    if (is_array($sections)) {
        foreach ($sections as $section) {
            $headlines = GetHeadlines($section['Link']);
            // Sections without usable headlines are left out.
            if (!is_array($headlines)) {
                continue;
            }
            DisplayHeader($headlines, $section['Title']);
        }
    } else {
        print "ERR Titles";
    }

    print $HTTP_FOOTER;
}

// MAIN HERE
// Entry point: set up the globals, then show either the selected articles
// (when the request carries ShowNews) or the headline index.

PrepareGlobals();

if (!isset($_REQUEST['ShowNews'])) {
    DoIndex();
} else {
    DoShowNews();
}

?>