spider.js 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249
  1. 'use strict'
  2. var http = require('http')
  3. var async = require('async')
  4. var iconv = require('iconv-lite')
  5. var BufferHelper = require('bufferhelper')
  6. var i = 0
  7. /**
  8. * 从国家统计局(http://www.stats.gov.cn/)抓取县级以及县级以上行政区划数据
  9. * @author modood <https://github.com/modood>
  10. * @datetime 2016-12-19 16:32
  11. */
  12. function fetch (callback) {
  13. // 数据截止2015年9月30日(发布时间:2016-08-09 11:28)
  14. http.get('http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201608/t20160809_1386477.html', function (res) {
  15. var rawData = ''
  16. var statusCode = res.statusCode
  17. console.log('[1/1] 正在抓取省份、城市和区县数据...')
  18. if (statusCode !== 200) {
  19. res.resume()
  20. return callback(new Error('Request Failed. Status Code: ' + statusCode))
  21. }
  22. res.setEncoding('utf8')
  23. res.on('data', function (chunk) {
  24. rawData += chunk
  25. })
  26. res.on('end', function () {
  27. var current
  28. var result = {}
  29. var reg = /<span lang="EN-US">(.*?)<span>&nbsp;&nbsp;&nbsp;&nbsp; <\/span><\/span><span style="font-family: 宋体">(.*?)<\/span>/g
  30. while ((current = reg.exec(rawData)) !== null) {
  31. result[current[1]] = current[2].trim()
  32. }
  33. return callback(null, result)
  34. })
  35. })
  36. .on('error', callback)
  37. }
  38. /**
  39. * 从国家统计局(http://www.stats.gov.cn/)抓取城乡行政区划数据
  40. * @author modood <https://github.com/modood>
  41. * @datetime 2016-12-19 16:35
  42. */
  43. function fetchStreets (area, total, callback) {
  44. var html = ''
  45. var areaCode = area.code
  46. var areaName = area.name
  47. // 两个特殊城市单独处理(中山市和东莞市没有县级行政区划)
  48. switch (areaCode) {
  49. case '441900': html = '44/4419.html'; break
  50. case '442000': html = '44/4420.html'; break
  51. default: html = areaCode.substr(0, 2) + '/' + areaCode.substr(2, 2) + '/' + areaCode + '.html'
  52. }
  53. // 数据截止2015年9月30日(发布时间:2016-07-27)
  54. http.get('http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/' + html, function (res) {
  55. var bufferHelper = new BufferHelper()
  56. var statusCode = res.statusCode
  57. if (['441900', '442000'].indexOf(areaCode) === -1) {
  58. console.log('[' + ++i + '/' + total + '] 正在抓取乡镇数据,当前区县:', areaCode, areaName)
  59. }
  60. if (statusCode !== 200) {
  61. res.resume()
  62. return callback(null, {})
  63. }
  64. res.on('data', function (chunk) {
  65. bufferHelper.concat(chunk)
  66. })
  67. res.on('end', function () {
  68. var rawData = iconv.decode(bufferHelper.toBuffer(), 'GBK')
  69. var current
  70. var result = {}
  71. var reg = /<tr class='towntr'><td><a href=.*?>(.*?)<\/a><\/td><td><a href=.*?>(.*?)<\/a><\/td><\/tr>/g
  72. while ((current = reg.exec(rawData)) !== null) {
  73. result[current[1]] = current[2].trim()
  74. }
  75. return callback(null, result)
  76. })
  77. })
  78. .on('error', function () {
  79. console.log('连接超时,马上重试...')
  80. fetchStreets(area, total, callback)
  81. })
  82. }
  83. /**
  84. * 提取省份、城市和区县数据
  85. * @author modood <https://github.com/modood>
  86. * @datetime 2016-12-20 13:18
  87. */
  88. function pick (callback) {
  89. fetch(function (err, data) {
  90. var provinces = []
  91. var cities = []
  92. var areas = []
  93. if (err) return callback(err)
  94. for (var k in data) {
  95. if (k.substr(2, 4) === '0000') {
  96. // 省份数据
  97. provinces.push({
  98. code: k,
  99. name: data[k]
  100. })
  101. } else if (k.substr(4, 2) === '00' && k.substr(2, 4) !== '0000') {
  102. // 城市数据
  103. cities.push({
  104. code: k,
  105. name: data[k],
  106. parent_code: k.substr(0, 2) + '0000'
  107. })
  108. } else if (k.substr(4, 2) !== '00') {
  109. // 区县数据
  110. areas.push({
  111. code: k,
  112. name: data[k],
  113. parent_code: k.substr(0, 4) + '00'
  114. })
  115. }
  116. }
  117. return callback(null, {
  118. provinces: provinces,
  119. cities: cities,
  120. areas: areas
  121. })
  122. })
  123. }
  124. /**
  125. * 提取乡镇数据
  126. * @author modood <https://github.com/modood>
  127. * @datetime 2016-12-20 13:17
  128. */
  129. function pickStreets (areas, callback) {
  130. var streets = []
  131. async.mapLimit(areas, 10, function (item, cb) {
  132. fetchStreets(item, areas.length, function (err, data) {
  133. if (err) return cb(err)
  134. for (var k in data) {
  135. // 乡镇数据
  136. streets.push({
  137. code: k,
  138. name: data[k],
  139. parent_code: k.substr(0, 6)
  140. })
  141. }
  142. return cb(null)
  143. })
  144. }, function (err) {
  145. if (err) console.log('getStreets timeout, ignored:\n', err)
  146. return callback(null, streets)
  147. })
  148. }
  149. /**
  150. * 两个特殊城市单独处理(中山市和东莞市没有县级行政区划)
  151. * @author modood <https://github.com/modood>
  152. * @datetime 2016-12-20 15:11
  153. */
  154. function handleSpecialCities (callback) {
  155. var areas = [
  156. {
  157. code: '442000',
  158. name: '中山市',
  159. parent_code: '442000'
  160. },
  161. {
  162. code: '441900',
  163. name: '东莞市',
  164. parent_code: '441900'
  165. }
  166. ]
  167. var streets = []
  168. async.each(areas, function (area, cb) {
  169. fetchStreets(area, areas.length, function (err, data) {
  170. if (err) return cb(err)
  171. for (var k in data) {
  172. streets.push({
  173. code: k,
  174. name: data[k],
  175. parent_code: k.substr(0, 6)
  176. })
  177. }
  178. return cb(null)
  179. })
  180. }, function (err, result) {
  181. if (err) return callback(err)
  182. return callback(null, {
  183. areas: areas,
  184. streets: streets
  185. })
  186. })
  187. }
  188. /**
  189. * 对抓取到的数据进行处理,提取出“省份”、“城市”、“区县”和“乡镇”四种数据
  190. * @author modood <https://github.com/modood>
  191. * @datetime 2016-12-19 16:37
  192. */
  193. exports.getData = function (callback) {
  194. async.auto({
  195. pca: function (cb) {
  196. pick(cb)
  197. },
  198. streets: ['pca', function (result, cb) {
  199. var areas = result.pca.areas
  200. pickStreets(areas, cb)
  201. }]
  202. }, function (err, result) {
  203. if (err) return callback(err)
  204. handleSpecialCities(function (err, r) {
  205. if (err) return callback(err)
  206. var areas = result.pca.areas.concat(r.areas)
  207. var streets = result.streets.concat(r.streets)
  208. return callback(null, {
  209. provinces: result.pca.provinces,
  210. cities: result.pca.cities,
  211. areas: areas,
  212. streets: streets
  213. })
  214. })
  215. })
  216. }