鉴于大家对asp十分关注,我们编辑小组在此为大家搜集整理了“写了段批量抓取某个列表页的东东”一文,供大家参考学习
有些人当抓取程序是个宝,到目前还TND有人在卖钱,强烈BS一下这些家伙 真是的!可能偶下边这段东西比较烂哈
下边这个没有写入库功能,已经到这一步了,入库功能是很简单的事了,需要的请自己去完成吧,其它功能各位自行完善吧!把代码拷贝过去直接运行即可看到效果
Dim Url,List_PageCode,Array_ArticleID,i,ArticleID
Dim Content_PageCode,Content_TempCode
Dim Content_CategoryID,Content_CategoryName,BorderID,ClassID,BorderName,ClassName
Dim ArticleTitle,ArticleAuthor,ArticleFrom,ArticleContent
Url = "/uploadfile/201306/17/25154434401.gif''",1) ''取得当前列表页的文章链接,以,分隔
Array_ArticleID = Split(List_PageCode,",") ''创建数组,存储文章ID
For i=0 To Ubound(Array_ArticleID)-1
ArticleID = Array_ArticleID(i) ''文章ID
Content_PageCode = getHTTPPage("http://www.webasp.net/article/"&ArticleID) ''取得文章页的内容
''=========取文章分类及相关ID参数 开始=======================
Content_TempCode = RegExpText(Content_PageCode,"<a href=""/article/"">技术教程</a> >> ",">> 内容</td>",0)
Content_CategoryID = RegExpText(Content_PageCode,"<a href=''../class","/''>",1)
BorderID = Split(Content_CategoryID,",")(0) ''大类ID
ClassID = Split(Content_CategoryID,",")(1) ''子类ID
''==========检查大类是否存在 开始===============
''如果不存在则入库
''==========检查大类是否存在 结束===============
''Response.Write(BorderID & "," & ClassID & "<br />")
Content_CategoryName = RegExpText(Content_PageCode,"/''>","</a>",1)
BorderName = Split(Content_CategoryName,",")(0) ''大类名称
ClassName = Split(Content_CategoryName,",")(1) ''子类名称
''==========检查子类是否存在 开始===============
''如果不存在则入库
''==========检查子类是否存在 结束===============
''=========取文章分类及相关ID参数 结束=======================
''=========取文章标题及内容 开始=============================
ArticleTitle = RegExpText(Content_PageCode,"<tr><td align=center bgcolor=#DEE2F5><strong>","</strong></td></tr>",0)
ArticleAuthor = RegExpText(Content_PageCode,"<tr><td><span class=blue>作者:</span>","</td></tr>",0)
ArticleFrom = RegExpText(Content_PageCode,"<tr><td><span class=blue>来源:</span>","</td></tr>",0)
ArticleContent = RegExpText(Content_PageCode,"<tr><td class=content style=""WORD-WRAP: break-word"" id=zoom>","</td></tr>"&VBCrlf&" </table>"&VBCrlf&" </td></tr></table>",0)
''=========取文章标题及内容 结束=============================
Response.Write(ArticleTitle& "<br /><br />")
Response.Flush()
Next
附几个函数:
Function getHTTPPage(url)
IF(IsObjInstalled("Microsoft.XMLHTTP") = False)THEN
Response.Write "<br><br>服务器不支持Microsoft.XMLHTTP组件"
Err.Clear
Response.End
END IF
On Error Resume Next
Dim http
SET http=Server.CreateObjec