Sunday, April 27, 2008

My MingPaoNews

記得大學Year 3時曾上過一個course,名為Information Retrieval,學會編寫網上蜘蛛,因忙著做Final Year Project而沒有把心血放在這course的project上,回心一想真是感到有點可惜。

兩三年前,當我還在玩PDA的時候,有網友編了個exe,功能是download 即日Apple Daily同明報,再用ebook工具做成電子書,加上定時功能放到PDA上,那麼每天上班也有新聞可讀。可惜這種program要時常update,網上的報紙要是一改版,program即時報廢。當時心想,不如自己寫一個... 幾年也沒有做出來...

今日,不知為什麼一起床就想去做這個東西,寫了一整天,完成了整個php script。

工作:
到www.mingpaonews.com取得所有headline,
用家選擇若干篇感興趣的文章後,再submit即可一次過閱讀所選的文章。

功能:
自動設charset為Big5-HKSCS,不用每頁也要手動轉;
文章的附圖也會貼上,onMouseOver可放大。

限制:
娛樂、財經及賽馬的排版是獨立的,沒有處理它們。

Code:


<?php

/**
 * Emit one article as a two-column HTML table.
 *
 * The left cell holds the article pictures (each followed by its caption),
 * the right cell holds the headline and the body. All values are written to
 * the output exactly as received; nothing is escaped here.
 *
 * @param string $title   Headline, wrapped in an <H1>.
 * @param string $body    Article body markup/text.
 * @param mixed  $pic     Array of image paths appended to the global
 *                        $base_url; any non-array value means "no pictures".
 * @param mixed  $pictext Captions, looked up with the same keys as $pic.
 */
function DisplayNewsPage($title,$body,$pic,$pictext)
{
    global $base_url;

    // Left column: thumbnails that enlarge on hover via popImg().
    echo '<table border=1><tr><td width=200 VALIGN="top">';
    if (is_array($pic))
    {
        foreach ($pic as $idx => $path)
        {
            $src = $base_url . $path;
            echo '<img src="' . $src . '" onmouseover="popImg(true, this);" onmouseout="popImg(false);" width=180>' . "<BR>\r\n";
            echo $pictext[$idx] . "<BR><BR>\r\n";
        }
    }

    // Right column: headline followed by the article text.
    echo '</td><td VALIGN="top">';
    echo '<H1>' . $title . '</H1><BR><BR>';
    echo $body;
    echo '</td></tr>';
    echo '</table>';
}

/**
 * Download one article page, scrape its headline, body and pictures, and
 * hand them to DisplayNewsPage().
 *
 * The page is fetched from the global $base_url with $url appended. If the
 * download fails nothing is rendered for this article. A missing headline or
 * body is rendered as an empty string instead of an undefined variable.
 *
 * @param string $url Article path relative to the global $base_url.
 */
function ProcessNewsPage($url)
{
    global $base_url;

    $main_content = file_get_contents($base_url.$url);
    if ($main_content === false)
    {
        // file_get_contents() has already raised a warning; there is
        // nothing to scrape, so skip this article.
        return;
    }

    // Headline: first match of the site's title <font> wrapper.
    $title = '';
    if (preg_match('|<font size="4" class="txt150" color="#006666">(.*)</font>|U', $main_content, $title_match))
    {
        $title = $title_match[1];
    }

    // Body: the site carries the article text in a hidden form field.
    $body = '';
    if (preg_match('|<input type="hidden" name="content" value="([^"]*)"|is', $main_content, $body_match))
    {
        $body = $body_match[1];
    }

    // Pictures and captions. These must start out as arrays: appending with
    // [] to a string is a fatal error on PHP 7.1 and later.
    $pic = array();
    $pictext = array();
    if (preg_match_all("|<font size=2 class='txt150'>(.*)\n.*;javascript:viewlargephoto\('([^']*)'\);|U",$main_content, $photo_matches, PREG_SET_ORDER))
    {
        foreach ($photo_matches as $photo)
        {
            $pic[] = $photo[2];      // image path
            $pictext[] = $photo[1];  // caption
        }
    }

    DisplayNewsPage($title,$body,$pic,$pictext);
}

/**
 * Scrape one section page for article links and print them as a form of
 * checkboxes, one per article, posting back to this script.
 *
 * Anchors are skipped when the whole tag (including its link text) contains
 * "http", "mailto" or href="#", or contains any entry of the global
 * $title_keys. Link texts that share the same href are concatenated, unless
 * the text is already part of what has been collected for that href.
 *
 * Link texts and $name are printed as received (they may carry markup).
 * Hrefs and PHP_SELF are escaped and quoted because they land inside HTML
 * attributes.
 *
 * @param string $name Section label shown in the left cell.
 * @param string $url  Location of the section page to download.
 */
function ProcessPage($name,$url)
{
    global $title_keys;

    $main_content = file_get_contents($url);
    if ($main_content === false)
    {
        // Download failed (warning already raised): render an empty section.
        $main_content = '';
    }

    $skip_keys = is_array($title_keys) ? $title_keys : array();
    $pages = array();   // href => accumulated link text

    preg_match_all("|<a[^>]*>(.*)</a>|U",$main_content, $anchors, PREG_SET_ORDER);
    foreach ($anchors as $anchor)
    {
        $tag  = $anchor[0];
        $text = $anchor[1];

        if (strstr($tag,'http')) continue;
        if (strstr($tag,'mailto')) continue;
        if (strstr($tag,'href="#')) continue;

        // Skip links that lead to other sections.
        $is_section_link = false;
        foreach ($skip_keys as $k)
        {
            // An empty needle behaves differently across PHP versions.
            if ($k !== '' && strpos($tag, $k) !== false)
            {
                $is_section_link = true;
                break;
            }
        }
        if ($is_section_link) continue;

        if (!preg_match('|href="([^"]*)"|U', $tag, $href_match)) continue;
        $href = $href_match[1];

        $existing = isset($pages[$href]) ? $pages[$href] : '';
        $needle = trim($text);
        if (strstr($text,"..."))
        {
            // NOTE(review): a link whose text contains "..." resets whatever
            // was collected for this href so far; kept as in the original.
            $pages[$href] = '';
        }
        else if ($needle !== '' && strpos($existing, $needle) !== false)
        {
            // Same headline seen again for this href: nothing to add.
        }
        else
        {
            $pages[$href] = $existing.$text;
        }
    }

    // Escaping is byte-safe for the Big5-HKSCS pages this script serves.
    $self = htmlspecialchars($_SERVER['PHP_SELF'], ENT_QUOTES, 'BIG5-HKSCS');

    print '<FORM action="'.$self.'" method=POST target=_new>';
    print '<table border=1><tr>';

    print '<td width=100 valign=top>'.$name ."<BR>\n";

    print '<INPUT type=SUBMIT value=Submit><INPUT type=RESET>';
    print '<INPUT type=HIDDEN name=ShowNews value=1>';
    print "<BR>\r\n</td><td valign=top>";
    foreach ($pages as $key => $val)
    {
        $key = (string)$key;   // numeric hrefs become integer array keys
        $attr = htmlspecialchars($key, ENT_QUOTES, 'BIG5-HKSCS');
        $query = 'ShowNews=1&'.rawurlencode($key).'='.rawurlencode($key);
        $link = $self.'?'.htmlspecialchars($query, ENT_QUOTES, 'BIG5-HKSCS');

        print '<INPUT type=CHECKBOX name="'.$attr.'" value="'.$attr.'">';
        print '<a href="'.$link.'" target=_new>'.$val."</a><BR>\n";
    }
    print '</td></tr></table>';
    print '</FORM>';
}

/**
 * Render every section recorded in the global $title_pages_set.
 *
 * Each entry maps a section label to its URL. Entries whose URL contains
 * "main" are skipped; all others are passed to ProcessPage(). When the
 * global is not an array, the text 'pass 12:00' is printed instead.
 */
function ProcessAllTitles()
{
    global $title_pages_set;

    if (is_array($title_pages_set))
    {
        foreach ($title_pages_set as $section => $link)
        {
            if (strstr($link,"main"))
            {
                continue;
            }
            ProcessPage($section, $link);
        }
        return;
    }

    echo 'pass 12:00';
}

/**
 * Scrape the front page ($main_url) for section links and publish them in
 * two globals:
 *
 *  - $title_keys      list of section hrefs (appended to on every call);
 *  - $title_pages_set map of section label => $base_url . href, or the empty
 *                     string when no section was found. ProcessAllTitles()
 *                     relies on that non-array value to detect "nothing
 *                     available".
 *
 * A section link is an anchor containing the white <font> title tag whose
 * href does not contain "http".
 */
function PrepareTitles()
{
    global $title_keys;
    global $title_pages_set;
    global $base_url;
    global $main_url;

    $title_tag = '<font color="FFFFFF">';
    $removal = array($title_tag, '</font>');

    if (!is_array($title_keys))
    {
        $title_keys = array();
    }
    $title_pages_set = "";

    $main_content = file_get_contents($main_url);
    if ($main_content === false)
    {
        // Front page unavailable (warning already raised): leave the
        // "nothing found" marker in place.
        return;
    }

    // Collect into a real array. Writing a string key into "" no longer
    // turns it into an array on PHP 7.1 and later.
    $sections = array();
    $seen = array();    // identical anchor tags are processed only once

    preg_match_all("|<a[^>]*>(.*)</a>|U",$main_content, $anchors, PREG_SET_ORDER);
    foreach ($anchors as $anchor)
    {
        $tag = $anchor[0];
        if (strpos($tag, $title_tag) === false) continue;
        if (isset($seen[$tag])) continue;
        $seen[$tag] = true;

        // Strip target="..." and the font wrapper before reading the href.
        $clean_tag = preg_replace('| target="[^"]*"|U','',$tag);
        $clean_tag = str_replace($removal, "", $clean_tag);
        $label = str_replace($removal, "", $anchor[1]);

        if (!preg_match('|href="([^"]*)"|U', $clean_tag, $href_match)) continue;
        $href = $href_match[1];
        if (strstr($href,"http")) continue;

        $title_keys[] = $href;
        $sections[$label] = $base_url.$href;
    }

    if (count($sections) > 0)
    {
        $title_pages_set = $sections;
    }
}



/**
 * Print the inline <script> block used by the article view.
 *
 * It defines two JavaScript functions: get(eid), a getElementById wrapper,
 * and popImg(open, iref), which shows a copy of the hovered image in an
 * absolutely positioned DIV with id "popImg" (created on first use) or hides
 * that DIV. The fragments are emitted back to back with no line breaks.
 */
function PrintJavaScript()
{
    $fragments = array(
        "<script type='text/javascript'>",
        "function get(eid)",
        "{",
        " var d = document;",
        " var r = d.getElementById(eid);",
        " return r;",
        "}",
        "function popImg(open, iref)",
        "{",
        " if (open)",
        " {",
        // Position the popup beside the hovered thumbnail.
        " var top = (iref.offsetParent.offsetParent.offsetTop + iref.offsetTop) + 'px';",
        " var curleft = 0;",
        " var obj = iref;",
        " do {",
        " curleft += obj.offsetLeft;",
        " } while (obj = obj.offsetParent);",
        " var left = (curleft + iref.offsetWidth )+ 'px';",
        ' var img = \'<img src="\' + iref.src + \'" />\';',
        " var d = document;",
        // Create the popup container the first time it is needed.
        " if (null == get('popImg'))",
        " {",
        " var pop = d.createElement('DIV');",
        " pop.id = 'popImg';",
        " pop.style.position = 'absolute';",
        " d.body.appendChild(pop);",
        " }",
        " var pop = get('popImg');",
        " pop.innerHTML = img;",
        " pop.style.top = top;",
        " pop.style.left = left;",
        " pop.style.display = 'block';",
        " }",
        " else",
        " {",
        " var pop = get('popImg');",
        " pop.style.display = 'none';",
        " }",
        "}",
        "</script>",
    );

    foreach ($fragments as $fragment)
    {
        echo $fragment;
    }
}

/**
 * Initialise the globals shared by the rest of the script:
 *
 *  - $HTTP_HEADER / $HTTP_FOOTER  opening and closing page markup; the
 *                                 header declares charset Big5-HKSCS;
 *  - $today                       current date as "Ymd";
 *  - $base_url                    the site's folder for that date;
 *  - $main_url                    the front page inside that folder.
 *
 * NOTE(review): date() uses the server's configured timezone; presumably the
 * dated folders follow Hong Kong time - confirm and set the timezone if the
 * server runs elsewhere.
 */
function PrepareGlobals()
{
    global $HTTP_HEADER;
    global $HTTP_FOOTER;

    global $today;
    global $base_url;
    global $main_url;

    // The document metadata belongs in <head>; the original emitted <header>,
    // which is not the head element, so the meta/title were misplaced.
    $HTTP_HEADER = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=Big5-HKSCS"><title>My MingPaoNews</title></head><body>'."\r\n";
    $HTTP_FOOTER = "\r\n".'</body></html>'."\r\n" ;

    $today = date("Ymd");
    $base_url = "http://www.mingpaonews.com/".$today."/";
    $main_url = $base_url."main.htm";
}

/**
 * Article view: print the page header and the popup JavaScript, then render
 * one article per request parameter, then print the footer.
 *
 * Every request parameter except "ShowNews" is treated as an article path
 * and passed to ProcessNewsPage(). Parameters whose value is not a non-empty
 * string (e.g. array-style parameters) are ignored.
 */
function DoShowNews()
{
    global $HTTP_HEADER;
    global $HTTP_FOOTER;

    print $HTTP_HEADER;
    PrintJavaScript();

    foreach($_REQUEST as $key => $value)
    {
        // Strict comparison: with ==, an integer key 0 equals any
        // non-numeric string on PHP versions before 8.
        if ((string)$key === "ShowNews") continue;

        // Array-valued or empty parameters cannot name an article.
        if (!is_string($value) || $value === '') continue;

        ProcessNewsPage($value);
    }

    print $HTTP_FOOTER;
}

/**
 * Index view: wrap the section listings between the page header and footer.
 *
 * PrepareTitles() collects the sections from the front page and
 * ProcessAllTitles() prints one selection form per section.
 */
function DoIndex()
{
    global $HTTP_HEADER, $HTTP_FOOTER;

    echo $HTTP_HEADER;

    // Discover the sections first, then list the articles of each.
    PrepareTitles();
    ProcessAllTitles();

    echo $HTTP_FOOTER;
}

// MAIN HERE



// Set up the shared globals, then dispatch: without a ShowNews parameter the
// headline index is shown, otherwise the selected articles are rendered.
PrepareGlobals();

if (!isset($_REQUEST['ShowNews']))
{
    DoIndex();
}
else
{
    DoShowNews();
}


?>

No comments: