首页 / 爬虫 / C#网页爬虫抓取行政区划
C#网页爬虫抓取行政区划
内容导读
互联网集市收集整理的这篇技术教程文章主要介绍了C#网页爬虫抓取行政区划,小编现在分享给大家,供广大互联网技能从业者学习和参考。文章包含6350字,纯文字阅读大概需要10分钟。
内容图文
借鉴C#网页爬虫抓取行政区划,从国家统计局获取了最新行政区域数据。
以下为代码贴片:
数据库类:
/// <summary>
/// Row model for one administrative region (country, province, city or county).
/// Property names are kept exactly as published because they mirror the column
/// names used when the rows are inserted into the database.
/// </summary>
public class City
{
    /// <summary>Numeric identifier of the region.</summary>
    public decimal ID { get; set; }

    /// <summary>Display name of the region.</summary>
    public string Name { get; set; }

    /// <summary>Administrative division code of the region.</summary>
    public string Code { get; set; }

    /// <summary>Hierarchy level stored as text: "1" country, "2" province, "3" city, "4" county.</summary>
    public string Org_Level { get; set; }

    /// <summary>Administrative division code of the parent region.</summary>
    public string ParentCode { get; set; }

    /// <summary>Identifier of the parent region.</summary>
    public decimal ParentID { get; set; }

    /// <summary>Country name. The spelling "Contry" is kept on purpose: it matches the database column.</summary>
    public string Contry { get; set; }

    /// <summary>X location value. NOTE(review): presumably a longitude; never populated in the visible code.</summary>
    public string Loc_x { get; set; }

    /// <summary>Y location value. NOTE(review): presumably a latitude; never populated in the visible code.</summary>
    public string Loc_y { get; set; }
}
获取网页帮助类:
/// <summary>
/// Helper for downloading the HTML text of a page over HTTP.
/// </summary>
public class HttpHelper
{
    private static readonly ILog log = log4net.LogManager.GetLogger(typeof(HttpHelper));

    /// <summary>
    /// Downloads the body of <paramref name="url"/> and decodes it with <paramref name="encod"/>.
    /// </summary>
    /// <param name="url">Address of the page to fetch.</param>
    /// <param name="encod">Encoding used to decode the response stream.</param>
    /// <returns>
    /// The decoded response body, or an empty string when the request fails, the
    /// status is not 200 OK, or the body cannot be read. Failures are logged and
    /// never propagated to the caller.
    /// </returns>
    public static string DownloadHtml(string url, Encoding encod)
    {
        string html = string.Empty;
        try
        {
            // Configure the request. A direct cast (instead of "as") makes a
            // non-HTTP URL fail with a descriptive InvalidCastException rather
            // than a NullReferenceException on the next line.
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Timeout = 10 * 1000; // 10 second timeout
            request.ContentType = "text/html;charset=utf-8";
            request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36";

            // Fetch the response.
            using (HttpWebResponse resp = (HttpWebResponse)request.GetResponse())
            {
                if (resp.StatusCode != HttpStatusCode.OK)
                {
                    log.Fatal(string.Format("抓取{0}地址返回失败,response.StatusCode = {1}", url, resp.StatusCode));
                }
                else
                {
                    try
                    {
                        // "using" guarantees the reader (and the underlying stream)
                        // is released even when ReadToEnd throws.
                        using (StreamReader sr = new StreamReader(resp.GetResponseStream(), encod))
                        {
                            html = sr.ReadToEnd();
                        }
                    }
                    catch (Exception e)
                    {
                        log.Fatal(string.Format("DownLoadHtml抓取html{0}保存失败", url), e);
                    }
                }
            }
        }
        catch (Exception e)
        {
            // Best effort by design: log and fall through to return the empty string.
            log.Fatal(e);
        }
        return html;
    }
}
数据库保存帮助类:
/// <summary>
/// Database helper that persists scraped <see cref="City"/> rows.
/// </summary>
public class SQLHelper
{
    /// <summary>
    /// Inserts every item of <paramref name="cityList"/> into the base_city table,
    /// issuing one parameterized INSERT per item over a single connection.
    /// </summary>
    /// <param name="cityList">
    /// Rows to insert. A null or empty list is a no-op. The items are not modified:
    /// null string properties are written as empty strings without touching the object.
    /// </param>
    /// <returns>The sum of the affected-row counts reported by the INSERTs that succeeded.</returns>
    /// <remarks>
    /// A failure on one row is written to the console and the loop continues with
    /// the next row. The connection string is read from the "DBContext" entry of
    /// the application configuration.
    /// </remarks>
    public static int ExecuteNonQueryForCity(List<City> cityList)
    {
        int count = 0;

        // Nothing to insert: skip opening a connection altogether.
        if (cityList == null || cityList.Count == 0)
        {
            return count;
        }

        var connectionString = System.Configuration.ConfigurationManager.ConnectionStrings["DBContext"].ConnectionString;

        using (SqlConnection connection = new SqlConnection(connectionString))
        using (SqlCommand cmd = new SqlCommand())
        {
            connection.Open();

            cmd.Connection = connection;
            cmd.CommandText = "insert into base_city(ID,name,Code,Contry,Loc_x,Loc_y,Org_Level,ParentCode,ParentID,state) values(@ID,@name,@Code,@Contry,@Loc_x,@Loc_y,@Org_Level,@ParentCode,@ParentID,@state)";

            foreach (var city in cityList)
            {
                try
                {
                    cmd.Parameters.Add(new SqlParameter("@ID", city.ID));
                    cmd.Parameters.Add(new SqlParameter("@name", city.Name ?? string.Empty));
                    cmd.Parameters.Add(new SqlParameter("@Code", city.Code ?? string.Empty));
                    cmd.Parameters.Add(new SqlParameter("@Contry", city.Contry ?? string.Empty));
                    cmd.Parameters.Add(new SqlParameter("@Loc_x", city.Loc_x ?? string.Empty));
                    cmd.Parameters.Add(new SqlParameter("@Loc_y", city.Loc_y ?? string.Empty));
                    cmd.Parameters.Add(new SqlParameter("@Org_Level", city.Org_Level ?? string.Empty));
                    cmd.Parameters.Add(new SqlParameter("@ParentCode", city.ParentCode ?? string.Empty));
                    cmd.Parameters.Add(new SqlParameter("@ParentID", city.ParentID));
                    cmd.Parameters.Add(new SqlParameter("@state", "1"));

                    int retval = cmd.ExecuteNonQuery();
                    if (retval == 0)
                    {
                        Console.WriteLine("插入错误:");
                    }
                    count += retval;
                }
                catch (Exception e)
                {
                    // Deliberately best effort: report the row failure and keep going.
                    Console.WriteLine("插入错误:" + e.Message);
                }
                finally
                {
                    // The command is reused, so its parameters must be reset for the next row.
                    cmd.Parameters.Clear();
                }
            }
        }

        return count;
    }
}
抓取数据:
/// <summary>
/// Scrapes the province / city / county list from the statistics-bureau page at
/// <see cref="UrlStr"/> and collects the rows in <see cref="SaveList"/>.
/// </summary>
/// <remarks>
/// The page lists rows in document order province, its cities, their counties,
/// next province, and so on. The parent of a new row is therefore always the
/// most recently added row of the level above it.
/// </remarks>
public class 省市县数据抓取
{
    private readonly ILog log = log4net.LogManager.GetLogger(typeof(省市县数据抓取));

    /// <summary>Address of the page that is scraped.</summary>
    public const string UrlStr = "http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201703/t20170310_1471429.html";

    /// <summary>All rows collected so far; the first entry is the country-level root row.</summary>
    public List<City> SaveList = new List<City>();

    /// <summary>
    /// Downloads the page, parses it and fills <see cref="SaveList"/>.
    /// Any failure is logged and rethrown with its original stack trace.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The downloaded document contains no matching paragraph (for example because the download failed).
    /// </exception>
    public 省市县数据抓取()
    {
        try
        {
            log.Info("抓取数据");
            string HtmlStr = HttpHelper.DownloadHtml(UrlStr, Encoding.UTF8);
            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(HtmlStr);

            // Plain ASCII quotes: the published snippet carried typographic quotes,
            // which do not form a valid XPath string literal.
            string liPath = "//p[@class='MsoNormal']";
            HtmlNodeCollection rows = doc.DocumentNode.SelectNodes(liPath);

            // Root row: every province hangs off this country-level entry.
            City c = new City() { ID = 1, Name = "全国", Code = "100000", Contry = "China", Org_Level = "1" };
            SaveList.Add(c);

            // SelectNodes returns null (not an empty collection) when nothing matches.
            if (rows == null)
            {
                throw new InvalidOperationException("No region rows were found in the page downloaded from " + UrlStr);
            }

            foreach (var item in rows)
            {
                var firstNode = item.FirstChild;
                if (firstNode == null)
                {
                    continue; // empty paragraph
                }

                // Province rows start with a bold element.
                if (firstNode.Name == "b")
                {
                    GetProvince(item);
                    continue;
                }

                // City and county rows start with an indentation-only node followed by
                // a code node and a name node; anything else is not a data row.
                if (StripWhitespace(firstNode.InnerText).Length != 0 || item.ChildNodes.Count < 3)
                {
                    continue;
                }

                // NOTE(review): the published snippet told cities and counties apart by
                // comparing the indentation text against two literals that were lost in
                // extraction (both read "" now, making the county branch unreachable).
                // The level is derived from the code instead, assuming six-digit codes
                // in which city-level entries end in "00" - confirm against the page.
                string code = StripWhitespace(item.ChildNodes[1].InnerText);
                if (code.Length != 6 || !code.All(char.IsDigit))
                {
                    continue;
                }

                if (code.EndsWith("00", StringComparison.Ordinal))
                {
                    GetCity(item);
                }
                else
                {
                    GetCounty(item);
                }
            }
        }
        catch (Exception e)
        {
            // LastOrDefault: the list may still be empty, and the diagnostics must not
            // throw an exception of their own that hides the real one.
            City last = SaveList.LastOrDefault();
            log.Info("last child code:" + (last == null ? string.Empty : last.Code));
            log.Error(e);
            throw; // preserves the original stack trace
        }
    }

    /// <summary>Adds a county row (level "4") under the most recent city row.</summary>
    /// <param name="item">Paragraph node whose second and third children hold the code and the name.</param>
    private void GetCounty(HtmlNode item)
    {
        // Placeholder rows are stored like any other row; no name is filtered out.
        AddRegion(StripWhitespace(item.ChildNodes[1].InnerText), item.ChildNodes[2].InnerText.Trim(), "4", "3");
    }

    /// <summary>Adds a city row (level "3") under the most recent province row.</summary>
    /// <param name="item">Paragraph node whose second and third children hold the code and the name.</param>
    private void GetCity(HtmlNode item)
    {
        AddRegion(StripWhitespace(item.ChildNodes[1].InnerText), item.ChildNodes[2].InnerText.Trim(), "3", "2");
    }

    /// <summary>Adds a province row (level "2") under the country-level root row.</summary>
    /// <param name="item">Paragraph node whose first two children each wrap the code and the name.</param>
    private void GetProvince(HtmlNode item)
    {
        AddRegion(StripWhitespace(item.ChildNodes[0].FirstChild.InnerText), item.ChildNodes[1].FirstChild.InnerText.Trim(), "2", "1");
    }

    /// <summary>
    /// Appends a row with the next sequential ID, linked to the most recently added
    /// row whose level is <paramref name="parentLevel"/>.
    /// </summary>
    /// <exception cref="InvalidOperationException">No row of the parent level has been added yet.</exception>
    private void AddRegion(string code, string name, string level, string parentLevel)
    {
        City parent = SaveList.Last(i => i.Org_Level == parentLevel);

        City c = new City();
        c.Code = code;
        c.Name = name;
        c.Org_Level = level;
        c.ID = SaveList.Last().ID + 1;
        c.ParentCode = parent.Code;
        c.ParentID = parent.ID;
        c.Contry = "China";
        SaveList.Add(c);
    }

    /// <summary>
    /// Decodes HTML entities and removes every whitespace character, including
    /// non-breaking and full-width spaces.
    /// </summary>
    /// <remarks>
    /// Replaces the published <c>Replace("","")</c> call, whose search text was lost
    /// in extraction; an empty search string makes string.Replace throw.
    /// </remarks>
    private static string StripWhitespace(string raw)
    {
        string decoded = HtmlAgilityPack.HtmlEntity.DeEntitize(raw ?? string.Empty) ?? string.Empty;
        return new string(decoded.Where(ch => !char.IsWhiteSpace(ch)).ToArray());
    }

    /// <summary>Writes every collected row to the database.</summary>
    public void Save()
    {
        log.Info("保存数据");
        SQLHelper.ExecuteNonQueryForCity(SaveList);
    }
}
全国 Org_Level =1
省 Org_Level =2
市 Org_Level =3
县 Org_Level =4
SaveList 中首先添加一条代表“全国”的根记录(Org_Level = 1),作为所有省级记录的父级。
网页数据是按 省 -> 市 -> 县 -> 省 -> 市 -> 县 的顺序依次读取的,因此在确定省、市、县的父级时,直接从 SaveList 中取最后一条上一级别的记录即可。
执行类:
// Constructing the crawler downloads and parses the page; Save() then writes the rows to the database.
var crawler = new 省市县数据抓取();
crawler.Save();
获取的数据如下:
原文:http://www.cnblogs.com/managersi/p/6941218.html
内容总结
以上是互联网集市为您收集整理的C#网页爬虫抓取行政区划全部内容,希望文章能够帮你解决C#网页爬虫抓取行政区划所遇到的程序开发问题。 如果觉得互联网集市技术教程内容还不错,欢迎将互联网集市网站推荐给程序员好友。
内容备注
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 gblab@vip.qq.com 举报,一经查实,本站将立刻删除。
内容手机端
扫描二维码推送至手机访问。