批量抓取某个列表页的教程分享

2018-09-10 01:13:37来源:爱站网 阅读 ()

新老客户大回馈,云服务器低至5折

本篇文章介绍了批量抓取某个列表页的教程分享,下面我们就来看看详细的教程,需要的朋友可以参考下。

有些人当抓取程序是个宝,到目前还TND有人在卖钱,强烈BS一下这些家伙 真是的!可能偶下边这段东西比较烂哈
下边这个没有写入库功能,已经到这一步了,入库功能是很简单的事了,需要的请自己去完成吧,其它功能各位自行完善吧!把代码拷贝过去直接运行即可看到效果

Dim?Url,List_PageCode,Array_ArticleID,i,ArticleID
Dim?Content_PageCode,Content_TempCode
Dim?Content_CategoryID,Content_CategoryName,BorderID,ClassID,BorderName,ClassName
Dim?ArticleTitle,ArticleAuthor,ArticleFrom,ArticleContent

Url?=?"http://www.webasp.net/article/class/1.htm"
List_PageCode?=?getHTTPPage(Url)
List_PageCode?=?RegExpText(List_PageCode,"打印","

List_PageCode?=?RegExpText(List_PageCode," '取得当前列表页的文章链接,以,分隔
Array_ArticleID?=?Split(List_PageCode,",")????'创建数组,存储文章ID

For?i=0?To?Ubound(Array_ArticleID)-1
????ArticleID?=?Array_ArticleID(i)????'文章ID
????Content_PageCode?=?getHTTPPage("http://www.webasp.net/article/"&ArticleID)????'取得文章页的内容

????'=========取文章分类及相关ID参数?开始=======================
????Content_TempCode?=?RegExpText(Content_PageCode,"技术教程?>>?",">>?内容",0)
????Content_CategoryID?=?RegExpText(Content_PageCode,"",1)
????BorderID?=?Split(Content_CategoryID,",")(0)????'大类ID
????ClassID?=?Split(Content_CategoryID,",")(1)????'子类ID
????????'==========检查大类是否存在?开始===============
????????'如果不存在则入库

????????'==========检查大类是否存在?结束===============
????'Response.Write(BorderID?&?","?&?ClassID?&?"
")
????Content_CategoryName?=?RegExpText(Content_PageCode,"/'>","",1)
????BorderName?=?Split(Content_CategoryName,",")(0)????'大类名称
????ClassName?=?Split(Content_CategoryName,",")(1)????'子类名称
????????'==========检查子类是否存在?开始===============
????????'如果不存在则入库

????????'==========检查子类是否存在?结束===============
????'=========取文章分类及相关ID参数?结束=======================

????'=========取文章标题及内容?开始=============================

????ArticleTitle?=?RegExpText(Content_PageCode,"","",0)
????ArticleAuthor?=?RegExpText(Content_PageCode," 作者:","",0)
????ArticleFrom?=?RegExpText(Content_PageCode," 来源:","",0)
????ArticleContent?=?RegExpText(Content_PageCode,"",""&VBCrlf&"????????"&VBCrlf&"????",0)
????'=========取文章标题及内容?结束=============================
????Response.Write(ArticleTitle&?"

")
????Response.Flush()
Next


附几个函数:

Function?getHTTPPage(url)?
????IF(IsObjInstalled("Microsoft.XMLHTTP")?=?False)THEN
????????Response.Write?"

服务器不支持Microsoft.XMLHTTP组件"?
????????Err.Clear
????????Response.End
????END?IF
????On?Error?Resume?Next
????Dim?http?
????SET?http=Server.CreateObject("Msxml2.XMLHTTP")?
????Http.open?"GET",url,False?
????Http.send()?
????IF(Http.readystate4)THEN
????????Exit?Function?
????END?IF?
????getHTTPPage=BytesToBSTR(Http.responseBody,"GB2312")
????SET?http=NOTHING
????IF(Err.number0)THEN
????????Response.Write?"

获取文件内容出错"?
????????'Response.End
????????Err.Clear
????END?IF??
End?Function

????????
Function?BytesToBstr(CodeBody,CodeSet)
????Dim?objStream
????SET?objStream?=?Server.CreateObject("adodb.stream")
????objStream.Type?=?1
????objStream.Mode?=3
????objStream.Open
????objStream.Write?CodeBody
????objStream.Position?=?0
????objStream.Type?=?2
????objStream.Charset?=?CodeSet
????BytesToBstr?=?objStream.ReadText?
????objStream.Close
????SET?objStream?=?NOTHING
End?Function

'================================================
'作??用:检查组件是否已经安装
'返回值:True??----已经安装
'????????False?----没有安装
'================================================
Function?IsObjInstalled(objName)
????On?Error?Resume?Next
????IsObjInstalled?=?False
????Err?=?0
????Dim?testObj
????SET?testObj?=?Server.CreateObject(objName)
????IF(0?=?Err)THEN?IsObjInstalled?=?True
????SET?testObj?=?NOTHING
????Err?=?0
End?Function

Function?RegExpText(strng,strStart,strEnd,n)
????Dim?regEx,Match,Matches,RetStr
????SET?regEx?=?New?RegExp
????regEx.Pattern?=?strStart&"([\s\S]*?)"&strEnd
????regEx.IgnoreCase?=?True
????regEx.Global?=?True
????SET?Matches?=?regEx.Execute(strng)
????For?Each?Match?in?Matches
????????IF(n=1)THEN
????????????RetStr?=?RetStr?&?regEx.Replace(Match.Value,"$1")?&?","?
????????ELSE
????????????RetStr?=?RetStr?&?regEx.Replace(Match.Value,"$1")
????????END?IF?
????Next
????RegExpText?=?RetStr
????SET?regEx=NOTHING
End?Function

标签:

版权申明:本站文章部分自网络,如有侵权,请联系:west999com@outlook.com
特别注意:本站所有转载文章言论不代表本站观点,本站所提供的摄影照片,插画,设计作品,如需使用,请与原作者联系,版权归原作者所有

上一篇:ASP生成html的几种方法

下一篇:ASP中怎么获取真实的IP地址的方法