本文实例讲述了PHP实现简单爬虫的方法。分享给大家供大家参考。具体如下:
<?php
/**
 * Crawler program -- prototype.
 */

/**
 * Fetch the HTML content of the given URL.
 *
 * At most 1 MiB is read from the stream.
 *
 * @param string $url URL (or any path/wrapper that fopen() accepts) to read.
 * @return string|false The content read, or false when the URL cannot be opened.
 */
function _getUrlContent($url)
{
    $handle = fopen($url, "r");
    if (!$handle) {
        return false;
    }

    $content = stream_get_contents($handle, 1024 * 1024);
    // Release the stream explicitly so repeated crawling does not leak handles.
    fclose($handle);

    return $content;
}

/**
 * Extract the href targets of all anchor tags from an HTML document.
 *
 * The values are returned exactly as written in the page (relative links are
 * not resolved here). Quoted and unquoted attribute values are supported and
 * tag/attribute names are matched case-insensitively.
 *
 * @param string $web_content HTML source to scan.
 * @return array List of href values in document order; empty when none are found.
 */
function _filterUrl($web_content)
{
    // "<a" must be followed by whitespace so that tags such as <abbr> or
    // <area> are not mistaken for anchors, and [^>] keeps the match inside a
    // single tag instead of running on into the following markup.
    $reg_tag_a = '/<a\s+(?:[^>]*?\s)?href\s*=\s*["\']?([^>"\'\s]*)[^>]*>/i';

    $result = preg_match_all($reg_tag_a, $web_content, $match_result);
    if (!$result) {
        // No match (0) or a PCRE error (false): nothing to report.
        return [];
    }

    return $match_result[1];
}
/**
 * Turn the links found on a page into absolute URLs.
 *
 * Entries that already start with http:// or https:// are kept unchanged.
 * Protocol-relative links ("//host/x") get the base scheme, root-relative
 * links ("/x") are attached to the base scheme/credentials/host/port, and all
 * other links are resolved against the directory of the base URL's path.
 * When the base URL has no scheme or host, relative entries cannot be
 * resolved and are left out of the result.
 *
 * @param string $base_url URL of the page the links were taken from.
 * @param array  $url_list Link targets as they appear in the page.
 * @return array|null Absolute URLs in input order, or null when $url_list is
 *                    not an array.
 */
function _reviseUrl($base_url, $url_list)
{
    if (!is_array($url_list)) {
        return null;
    }

    // $root is "scheme://[user[:pass]@]host[:port]"; $dir is the directory
    // part of the base path and always ends with a slash.
    $root = '';
    $dir = '/';
    $url_info = parse_url($base_url);
    if (is_array($url_info) && isset($url_info['scheme'], $url_info['host'])) {
        $root = $url_info['scheme'] . '://';
        if (isset($url_info['user'])) {
            $root .= $url_info['user'];
            if (isset($url_info['pass'])) {
                $root .= ':' . $url_info['pass'];
            }
            $root .= '@';
        }
        $root .= $url_info['host'];
        if (isset($url_info['port'])) {
            $root .= ':' . $url_info['port'];
        }
        if (isset($url_info['path'])) {
            $last_slash = strrpos($url_info['path'], '/');
            if ($last_slash !== false) {
                // Drop the file name ("/dir/page.html" -> "/dir/").
                $dir = substr($url_info['path'], 0, $last_slash + 1);
            }
        }
    }

    $result = [];
    foreach ($url_list as $url_item) {
        if (preg_match('/^https?:\/\//i', $url_item)) {
            // Already a complete URL.
            $result[] = $url_item;
        } elseif ($root === '') {
            // The base URL could not be parsed; a relative link has nothing
            // to be resolved against.
            continue;
        } elseif (strncmp($url_item, '//', 2) === 0) {
            // Protocol-relative: only the scheme is inherited.
            $result[] = $url_info['scheme'] . ':' . $url_item;
        } elseif (strncmp($url_item, '/', 1) === 0) {
            // Root-relative: replaces the whole path of the base URL.
            $result[] = $root . $url_item;
        } else {
            // Relative to the directory of the current page.
            $result[] = $root . $dir . $url_item;
        }
    }

    return $result;
}
/**
 * Crawl a single page: download it and collect the links it contains.
 *
 * The page is fetched with _getUrlContent(), its anchors are extracted with
 * _filterUrl(), and the result is passed through _reviseUrl() together with
 * the page URL.
 *
 * @param string $url Address of the page to crawl.
 * @return array|null The list produced by _reviseUrl(), or null when the page
 *                    could not be read, was empty, or yielded no links.
 */
function crawler($url)
{
    $content = _getUrlContent($url);
    if (!$content) {
        return null;
    }

    $url_list = _reviseUrl($url, _filterUrl($content));
    if (!$url_list) {
        return null;
    }

    return $url_list;
}
/**
 * Test driver for the crawler.
 *
 * Starts from a fixed seed URL, appends every URL returned by crawler() to
 * url.txt (one per line, CRLF terminated) and then reads the next URL to
 * visit back from the same file. The loop ends once the reader reaches the
 * end of url.txt.
 *
 * @return void
 * @throws RuntimeException When url.txt cannot be opened.
 */
function main()
{
    $current_url = "http://hao123.com/"; // seed URL

    $fp_puts = fopen("url.txt", "ab"); // handle used to record discovered URLs
    if ($fp_puts === false) {
        throw new RuntimeException('Unable to open url.txt for appending');
    }
    $fp_gets = fopen("url.txt", "r"); // handle used to read back the next URL
    if ($fp_gets === false) {
        fclose($fp_puts);
        throw new RuntimeException('Unable to open url.txt for reading');
    }

    do {
        $result_url_arr = crawler($current_url);
        if ($result_url_arr) {
            foreach ($result_url_arr as $url) {
                fputs($fp_puts, $url . "\r\n");
            }
        }

        // Fetch the next non-empty line. fgets() keeps the line ending, so it
        // has to be trimmed off before the value can be used as a URL.
        $current_url = false;
        while (($line = fgets($fp_gets, 1024)) !== false) {
            $line = trim($line);
            if ($line !== '') {
                $current_url = $line;
                break;
            }
        }
    } while ($current_url !== false);

    fclose($fp_gets);
    fclose($fp_puts);
}
// Script entry point: run the test driver defined above.
main();
?>