Discuz教程网

PHP 论坛采集程序 模拟登录,抓取页面 实现代码

[复制链接]
authicon dly 发表于 2011-9-3 19:36:59 | 显示全部楼层 |阅读模式
代码如下:

  1. <?php
  2. // 吴燕军
  3. // 2009-06-27
  4. // 采集程序php
  // Scraping many pages can take a long time, so remove PHP's execution time limit.
  set_time_limit(0);
  // File in which cURL persists the session cookies between requests.
  // Built from sys_get_temp_dir() instead of a hard-coded "/tmp" so the script
  // also runs on systems whose temporary directory lives elsewhere.
  $cookie_jar = rtrim(sys_get_temp_dir(), '/\\') . '/cookie.tmp';
  8. /*函数------------------------------------------------------------------------------------------------------------*/
  /**
   * Performs an HTTP request with cURL and returns the response body.
   *
   * Cookies are read from and written back to $cookie_jar, so a session
   * established by one call is reused by later calls. Redirects are followed.
   *
   * A non-empty $postfields is sent as a POST body. An empty $postfields
   * results in a plain GET; the original always forced POST, even for simple
   * page fetches. The port is taken from the URL rather than being pinned to
   * 80, so https and explicit-port URLs work.
   *
   * @param string       $url        Address to request.
   * @param string|array $postfields POST body, or '' for a GET request.
   * @param string       $cookie_jar Path of the cookie file shared between calls.
   * @param string       $referer    Value of the Referer header.
   * @return string|false Response body, or false when the request fails.
   */
  function request($url, $postfields, $cookie_jar, $referer)
  {
      $ch = curl_init();
      if ($ch === false) {
          return false;
      }

      $options = array(
          CURLOPT_URL => $url,
          CURLOPT_HEADER => 0,
          CURLOPT_NOBODY => 0,
          CURLOPT_RETURNTRANSFER => 1,
          CURLOPT_FOLLOWLOCATION => 1,
          CURLOPT_COOKIEJAR => $cookie_jar,
          CURLOPT_COOKIEFILE => $cookie_jar,
          CURLOPT_REFERER => $referer,
      );

      // Only switch to POST when there is actually something to post.
      if ($postfields !== '' && $postfields !== null && $postfields !== array()) {
          $options[CURLOPT_POST] = 1;
          $options[CURLOPT_POSTFIELDS] = $postfields;
      }

      curl_setopt_array($ch, $options);
      $code = curl_exec($ch);
      curl_close($ch);

      return $code;
  }
  /**
   * Extracts thread ids from the HTML of a forum listing page.
   *
   * Every occurrence of " <!--" followed by a link to "viewthread.php?tid=N"
   * contributes its numeric N to the result, in document order.
   *
   * NOTE(review): the class [.|\r|\n] matches only a literal '.', '|', CR or LF,
   * not "any character" - confirm this is what the target markup needs.
   * NOTE(review): the pattern requires a literal space before "<!--" and "<a";
   * verify against the real page source.
   *
   * @param string $code HTML of the listing page.
   * @return array List of thread ids as strings (empty when nothing matches).
   */
  function getThreadsList($code)
  {
      $linkPattern = '/ <!--[.|\r|\n]*? <a href="viewthread.php\?tid=(\d+)/';
      $found = array();
      preg_match_all($linkPattern, $code, $found);

      // Index 1 holds the values captured by the (\d+) group.
      return $found[1];
  }
  /**
   * Tells whether a fetched thread page represents an existing thread.
   *
   * The page is treated as missing when it contains the forum's notice that
   * the topic does not exist, was deleted, or is awaiting review.
   *
   * @param string $code HTML of the thread page.
   * @return bool False when the notice is present, true otherwise.
   */
  function isExits($code)
  {
      $missingNotice = '/ <p>指定的主题不存在或已被删除或正在被审核,请返回。 <\/p>/';

      // preg_match() yields 1 only on a match; anything else means "no notice found".
      return preg_match($missingNotice, $code) !== 1;
  }
  /**
   * Extracts the first <h1> element (the thread title) from a thread page.
   *
   * The original pattern used the negated character class [^ <\/h1>], which
   * stops at the first space, 'h', '1', '/', '<' or '>' and therefore cut
   * almost every title short. This version matches lazily up to the closing
   * </h1> tag, across line breaks.
   *
   * The returned string still contains the heading markup, now including the
   * closing tag; callers are expected to strip tags before display.
   *
   * @param string $code HTML of the thread page.
   * @return string The "<h1>...</h1>" element, or '' when the page has none.
   */
  function getTitle($code)
  {
      if (!is_string($code) || preg_match('/<h1\b[^>]*>.*?<\/h1>/is', $code, $title_tmp) !== 1) {
          return '';
      }

      return $title_tmp[0];
  }
  /**
   * Extracts the author name from a thread page.
   *
   * Looks for the user-profile link (space.php?uid=N with id "userinfoN") and
   * returns it with all tags stripped. Because the pattern ends in a greedy
   * ".+", everything that follows the link on the same line is included
   * before tag stripping.
   *
   * Returns '' instead of reading an undefined match when the link is absent.
   *
   * @param string $code HTML of the thread page.
   * @return string Tag-free author text, or '' when no author link is found.
   */
  function getAuthor($code)
  {
      $pattern = '/ <a href="space.php\?uid=\d+" target="_blank" id="userinfo\d+" onmouseover="showMenu\(this\.id\)">.+/';
      if (!is_string($code) || preg_match($pattern, $code, $author_tmp) !== 1) {
          return '';
      }

      return strip_tags($author_tmp[0]);
  }
  /**
   * Extracts the first post body (the thread starter's message) from a thread page.
   *
   * Matches the first <div id="postmessage_N" class="t_msgfont"> element up to
   * the first following " </div>" and rewrites every "images/" occurrence to
   * the forum's absolute image URL so pictures still load when the HTML is
   * shown on another host.
   *
   * Compared with the original, ".*?" with the "s" modifier replaces
   * "(.|\r|\n)*?": it matches the same text without a capturing alternation
   * per character, which is far cheaper on long posts. A page without a
   * message block now yields '' instead of reading an undefined match.
   *
   * NOTE(review): the replacement is a blanket substring substitution, so a
   * URL that is already absolute would be prefixed a second time - confirm
   * the forum only emits relative image paths.
   *
   * @param string $code HTML of the thread page.
   * @return string HTML of the message block, or '' when none is found.
   */
  function getContents($code)
  {
      $pattern = '/ <div id="postmessage_\d+" class="t_msgfont">.*? <\/div>/s';
      if (!is_string($code) || preg_match($pattern, $code, $contents_tmp) !== 1) {
          return '';
      }

      return str_replace('images/', 'http://bbs.war3.cn/images/', $contents_tmp[0]);
  }
  /**
   * Outputs the thread title section.
   *
   * @param string $title Title markup; tags are stripped before output.
   * @return void
   */
  function printTitle($title)
  {
      $plainTitle = strip_tags($title);
      echo ' <strong> <h2>帖子标题: </h2> </strong>' . $plainTitle . ' <br/> <br/>';
  }
  /**
   * Outputs the thread author section.
   *
   * @param string $author Author markup; tags are stripped before output.
   * @return void
   */
  function printAuthor($author)
  {
      $plainAuthor = strip_tags($author);
      echo ' <strong> <h2>帖子作者: </h2> </strong>' . $plainAuthor . ' <br/> <br/>';
  }
  /**
   * Outputs the post body section.
   *
   * The content is written as-is (no tag stripping or escaping), so any HTML
   * it contains is rendered by the browser.
   *
   * @param string $contents HTML of the post body.
   * @return void
   */
  function printContents($contents)
  {
      $heading = ' <strong> <h2>作者发表的内容: </h2>';
      $closing = ' </strong> <br/>';
      echo $heading . $contents . $closing;
  }
  /**
   * Outputs the fixed "this thread does not exist" notice.
   *
   * @return void
   */
  function printError()
  {
      print ' <i>该帖子不存在! </i>';
  }
  73. /*函数列表end---------------------------------------------------------------------------------------------------*/
  /* Log in to the forum: begin */
  // NOTE(review): the credentials below are hard-coded placeholders; load real
  // ones from configuration instead of committing them to source.
  $url = 'http://bbs.war3.cn/logging.php?action=login';
  // http_build_query() URL-encodes each value and joins the pairs with "&".
  // The hand-written string had stray spaces inside parameter names
  // ("& password", "& referer") and left the referer URL and the submit label
  // unencoded.
  $postfields = http_build_query(array(
      'loginfield' => 'username',
      'username' => '1nject10n',
      'password' => 'xxxxxx',
      'questionid' => 0,
      'cookietime' => 315360000,
      'referer' => 'http://bbs.war3.cn/',
      'loginsubmit' => '提交',
  ), '', '&');
  request($url, $postfields, $cookie_jar, '');
  unset($postfields, $url);
  /* Log in to the forum: end */

  /* Fetch the thread list (threads on the first page): begin */
  $url = 'http://bbs.war3.cn/forumdisplay.php?fid=57';
  $code = request($url, '', $cookie_jar, '');
  // A failed request yields false; treat it as an empty listing.
  $threadsList = ($code === false) ? array() : getThreadsList($code);
  /* Fetch the thread list: end */

  // Running count of threads printed so far; also drives the alternating colors.
  $rows = 0;

  /* Fetch and print every listed thread: begin */
  foreach ($threadsList as $list) {
      $url = "http://bbs.war3.cn/viewthread.php?tid=$list";
      // Download the thread first, then test THIS response. The original
      // tested $code before fetching, i.e. it inspected the previous page.
      $code = request($url, '', $cookie_jar, '');

      if ($code !== false && isExits($code)) {
          $color = $rows % 2 == 0 ? '#00CCFF' : '#FFFF33';
          echo " <div style='background-color:$color'>";
          echo " <h1>第", ($rows + 1), "贴: </h1> <br/>";
          $author = getAuthor($code);
          printAuthor($author);
          $title = getTitle($code);
          printTitle($title);
          $contents = getContents($code);
          printContents($contents);
          echo " </div>";
          $rows++;
      } else {
          printError();
      }

      echo "----------------------------------------------------------------------------------------- <br/> <br/>";
  }
  /* Fetch and print every listed thread: end */
  109. ?>
复制代码





上一篇:UTF8编码内的繁简转换的PHP类
下一篇:PHP 超链接 抓取实现代码
您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

1314学习网 ( 浙ICP备10214163号 )

GMT+8, 2025-5-2 16:34

Powered by Discuz! X3.4

© 2001-2013 Comsenz Inc.

快速回复 返回顶部 返回列表