这几天偶然看见了 C# 抓取网页链接的代码,感觉当时做得很简单,呵呵,也没多考虑什么过程。先把简单的给大家拿出来看看,如果大家有什么意见或者有好的方法,可以共同交流。谢谢!以下仅供参考:
- using System;
- using System.Collections.Generic;
- using System.ComponentModel;
- using System.Data;
- using System.Drawing;
- using System.Linq;
- using System.Text;
- using System.Windows.Forms;
- using System.Xml;
- using System.Net;
- using System.IO;
- using System.Collections;
- using System.Text.RegularExpressions;
- namespace text
- {
- public partial class Form1 : Form
- {
// HTML source returned by GetPageSource during the most recent button click.
private string strCode;

// Hyperlinks returned by GetHyperLinks during the most recent button click.
private ArrayList alLinks;

/// <summary>
/// Creates the form and runs InitializeComponent, which is defined outside
/// this listing (the class is declared partial).
/// </summary>
public Form1()
{
    this.InitializeComponent();
}
/// <summary>
/// Click handler: reads the address typed into textBox1, downloads that page,
/// extracts its hyperlinks and writes them to the XML output file.
/// </summary>
/// <param name="sender">Control that raised the event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void button1_Click(object sender, EventArgs e)
{
    // Trim first so that whitespace-only input is rejected like empty input.
    string strURL = textBox1.Text.Trim();
    if (strURL.Length == 0)
    {
        MessageBox.Show("请输入网址");
        return;
    }

    // StartsWith is safe for inputs shorter than the prefix, whereas
    // Substring(0, 7) throws on them. The comparison ignores case, and an
    // explicit https:// scheme is kept rather than prefixed with http://.
    bool hasScheme =
        strURL.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
        strURL.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
    if (!hasScheme)
    {
        strURL = "http://" + strURL;
    }

    try
    {
        MessageBox.Show("正在获取页面代码,请稍后");
        strCode = GetPageSource(strURL);
        MessageBox.Show("正在提取超链接,请稍侯");
        alLinks = GetHyperLinks(strCode);
        MessageBox.Show("正在写入文件,请稍侯");
        WriteToXml(strURL, alLinks);
    }
    catch (UriFormatException ex)
    {
        // Raised by the Uri constructor when the typed address is malformed.
        MessageBox.Show("网址格式不正确:" + ex.Message);
    }
    catch (WebException ex)
    {
        // Raised by the HTTP request (DNS failure, timeout, HTTP error status, ...).
        MessageBox.Show("获取页面失败:" + ex.Message);
    }
}
/// <summary>
/// Downloads the HTML source of the given web page.
/// </summary>
/// <param name="URL">Absolute URL of the page to fetch.</param>
/// <returns>The full response body, decoded as GB2312 text.</returns>
/// <exception cref="UriFormatException"><paramref name="URL"/> is not a valid URI.</exception>
/// <exception cref="WebException">The request failed.</exception>
public static string GetPageSource(string URL)
{
    Uri uri = new Uri(URL);
    HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);

    // The request must be configured BEFORE GetResponse() sends it;
    // assigning these properties afterwards has no effect on the request.
    hwReq.Method = WebRequestMethods.Http.Get;
    hwReq.KeepAlive = false;

    // Dispose the response, its stream and the reader even if reading throws,
    // so the underlying connection is always released.
    using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
    using (Stream body = hwRes.GetResponseStream())
    using (StreamReader reader = new StreamReader(body, System.Text.Encoding.GetEncoding("GB2312")))
    {
        // NOTE(review): the encoding is hard-coded to GB2312; pages served in
        // another charset will be decoded incorrectly.
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Extracts the distinct absolute URLs (http or https) found in a piece of HTML.
/// </summary>
/// <param name="htmlCode">HTML text to scan; null or empty yields an empty list.</param>
/// <returns>
/// An <see cref="ArrayList"/> of unique URL strings, sorted with the default comparer.
/// </returns>
public static ArrayList GetHyperLinks(string htmlCode)
{
    ArrayList al = new ArrayList();
    if (string.IsNullOrEmpty(htmlCode))
    {
        return al;
    }

    // Scheme, dotted host name, then an optional path/query made of the listed characters.
    string strRegex = @"https?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
    Regex r = new Regex(strRegex, RegexOptions.IgnoreCase);

    // A hash set filters duplicate URLs in O(1) per match instead of
    // rescanning the whole result list for every match.
    HashSet<string> seen = new HashSet<string>(StringComparer.Ordinal);
    foreach (Match m in r.Matches(htmlCode))
    {
        string strNew = m.Value;
        if (seen.Add(strNew))
        {
            al.Add(strNew);
        }
    }

    al.Sort();
    return al;
}
/// <summary>
/// Writes the extracted URLs to the file "HyperLinks.xml" (relative path, UTF-8).
/// Each URL becomes one element whose name is the value returned by GetDomain
/// for that URL and whose text is the URL itself.
/// </summary>
/// <param name="strURL">Address of the scanned page; recorded in an XML comment.</param>
/// <param name="alHyperLinks">URL strings to write.</param>
static void WriteToXml(string strURL, ArrayList alHyperLinks)
{
    // using guarantees the writer (and the file handle) is closed even when a write throws.
    using (XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8))
    {
        writer.Formatting = Formatting.Indented;
        writer.WriteStartDocument(false);
        writer.WriteDocType("HyperLinks", null, "urls.dtd", null);

        // An XML comment may not contain "--", and WriteComment throws if it does,
        // so put a space after every hyphen that is directly followed by another one.
        string safeUrl = Regex.Replace(strURL ?? string.Empty, "-(?=-)", "- ");
        writer.WriteComment("提取自" + safeUrl + "的超链接");

        // Two nested elements with the same name, exactly as in the original output format.
        writer.WriteStartElement("HyperLinks");
        writer.WriteStartElement("HyperLinks", null);
        writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

        foreach (string str in alHyperLinks)
        {
            string title = GetDomain(str);
            writer.WriteElementString(title, null, str);
        }

        writer.WriteEndElement();
        writer.WriteEndElement();
        writer.Flush();
    }
}
/// <summary>
/// Returns the domain suffix of a URL without its dot and slash.
/// </summary>
/// <param name="strURL">URL to inspect.</param>
/// <returns>
/// The first of com, net, cn, org or gov that appears in the URL as ".xxx/"
/// (matched case-insensitively, returned as written in the URL), or "other"
/// when none is found or the input is null or empty.
/// </returns>
static string GetDomain(string strURL)
{
    if (string.IsNullOrEmpty(strURL))
    {
        return "other";
    }

    // The suffix is only recognised when it is immediately followed by a slash.
    string strRegex = @"(\.com/|\.net/|\.cn/|\.org/|\.gov/)";
    Match m = Regex.Match(strURL, strRegex, RegexOptions.IgnoreCase);

    // Strip the leading dot and the trailing slash from the match, e.g. ".com/" -> "com".
    // An unsuccessful match has an empty Value, which falls through to "other".
    string retVal = Regex.Replace(m.Value, @"\.|/$", "");
    if (retVal.Length == 0)
    {
        retVal = "other";
    }

    return retVal;
}
- }
- }