// crawler.js
  1. const http = require('http')
  2. const iconv = require('iconv-lite')
  3. const BufferHelper = require('bufferhelper')
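  /*
   * Naming abbreviations used in this file
   *
   * Province level (省级: province)                          p
   * Prefecture level (地级: city)                            c
   * County level (县级: area / district / county)            a
   * Township level (乡级: township / street / sub-district)  s
   * Village level (村级: village / residents' committee)     v
   */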
  // Host that serves the 2016 administrative-division code pages.
  const host = 'www.stats.gov.cn'

  // Pattern for a table row whose two cells each wrap their text in a link;
  // group 1 is the first cell's text, group 2 the second cell's text.
  // A fresh RegExp is returned on every call so that no two entries of
  // `links` share a stateful `lastIndex`.
  const linkedRowPattern = () =>
    /<tr class='.*?'><td><a href=.*?>(.*?)<\/a><\/td><td><a href=.*?>(.*?)<\/a><\/td><\/tr>/g

  // One entry per level, ordered from the province index page (0) down to
  // the village list (4). `path` is a template whose `#{...}` placeholders
  // are substituted by the fetch* functions; `regexp` extracts (code, name).
  const links = [
    // 0: province index — group 1 is the href without `.html`, group 2 the name
    {
      path: '/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html',
      regexp: /<td><a href='(.*?).html'>(.*?)<br\/><\/a><\/td>/g
    },
    // 1: cities of a province
    {
      path: '/tjsj/tjbz/tjyqhdmhcxhfdm/2016/#{pCode}.html',
      regexp: linkedRowPattern()
    },
    // 2: areas of a city
    {
      path: '/tjsj/tjbz/tjyqhdmhcxhfdm/2016/#{pCode}/#{cCode}.html',
      regexp: linkedRowPattern()
    },
    // 3: streets of an area
    {
      path: '/tjsj/tjbz/tjyqhdmhcxhfdm/2016/#{pCode}/#{cCodeSuffix}/#{aCode}.html',
      regexp: linkedRowPattern()
    },
    // 4: villages of a street — three plain cells, the middle one is skipped
    {
      path: '/tjsj/tjbz/tjyqhdmhcxhfdm/2016/#{pCode}/#{cCodeSuffix}/#{aCodeSuffix}/#{sCode}.html',
      regexp: /<td>(.*?)<\/td><td>.*?<\/td><td>(.*?)<\/td>/g
    }
  ]
  /**
   * Download one page over HTTP, decode it as GBK and extract code/name pairs.
   *
   * For every match of `regexp` in the decoded body, capture group 1
   * (truncated to its first `codeLen` characters) becomes a key of the result
   * and the trimmed capture group 2 becomes its value.
   *
   * @author modood <https://github.com/modood>
   * @datetime 2018-01-31 19:23
   * @param {string} host - Host name to request.
   * @param {string} path - Request path on that host.
   * @param {RegExp} regexp - Pattern with at least two capture groups. It is
   *   copied before use, so the caller's `lastIndex` is neither read nor changed.
   * @param {number} codeLen - Number of leading characters of group 1 to keep.
   * @returns {Promise<Object<string, string>>} Resolves with the extracted map.
   *   Rejects on a non-200 status, a request/response error, a response that
   *   closes before completion, a socket timeout (3000 ms), or a failure while
   *   parsing the body.
   */
  exports.fetch = (host, path, regexp, codeLen) =>
    new Promise((resolve, reject) => {
      // Work on a private, global copy of the pattern: a shared /g regex
      // carries `lastIndex` between calls, and a non-global one would make the
      // exec loop below spin forever on the first match.
      const flags = regexp.flags.includes('g') ? regexp.flags : `${regexp.flags}g`
      const pattern = new RegExp(regexp.source, flags)

      const req = http.get({ host, path, timeout: 3000 }, res => {
        const statusCode = res.statusCode
        if (statusCode !== 200) {
          // Drain the body so the socket is released.
          res.resume()
          return reject(new Error(`Request Failed. Status Code: ${statusCode}`))
        }

        const bufferHelper = new BufferHelper()
        res.on('data', chunk => bufferHelper.concat(chunk))
        // Without these two handlers a connection dropped mid-body would
        // leave the promise pending forever.
        res.on('error', reject)
        res.on('close', () => {
          if (!res.complete) reject(new Error('Response closed before completion'))
        })
        res.on('end', () => {
          // An exception thrown in this callback would otherwise be uncaught
          // and the promise would never settle.
          try {
            const rawData = iconv.decode(bufferHelper.toBuffer(), 'GBK')
            const result = {}
            let current
            while ((current = pattern.exec(rawData)) !== null) {
              // A zero-length match does not advance lastIndex by itself.
              if (current[0] === '') pattern.lastIndex++
              result[current[1].substring(0, codeLen)] = current[2].trim()
            }
            resolve(result)
          } catch (err) {
            reject(err)
          }
        })
      })

      req.on('error', reject)
      // The 'timeout' event only reports an idle socket; the request has to be
      // destroyed explicitly. Destroying it with an error emits 'error', which
      // rejects the promise with that same 'timeout' error.
      req.on('timeout', () => req.destroy(new Error('timeout')))
    })
  58. /**
  59. * 抓取省级数据
  60. * @author modood <https://github.com/modood>
  61. * @datetime 2018-01-31 19:40
  62. */
  63. exports.fetchProvinces = async () => {
  64. try {
  65. return await exports.fetch(host, links[0].path, links[0].regexp, 2)
  66. } catch (err) {
  67. console.log(`抓取省级数据失败(${err}),正在重试...`)
  68. return this.fetchProvinces()
  69. }
  70. }
  71. /**
  72. * 抓取地级数据
  73. * @author modood <https://github.com/modood>
  74. * @datetime 2018-01-31 19:51
  75. */
  76. exports.fetchCities = async (pCode) => {
  77. const path = links[1].path.replace('#{pCode}', pCode)
  78. try {
  79. return await exports.fetch(host, path, links[1].regexp, 4)
  80. } catch (err) {
  81. console.log(`抓取省级(${pCode})的地级数据失败(${err}),正在重试...`)
  82. return this.fetchCities(pCode)
  83. }
  84. }
  85. /**
  86. * 抓取县级数据
  87. * @author modood <https://github.com/modood>
  88. * @datetime 2018-01-31 20:03
  89. */
  90. exports.fetchAreas = async (cCode) => {
  91. cCode = cCode.toString()
  92. const pCode = cCode.substr(0, 2)
  93. const path = links[2].path
  94. .replace('#{pCode}', pCode)
  95. .replace('#{cCode}', cCode)
  96. try {
  97. return await exports.fetch(host, path, links[2].regexp, 6)
  98. } catch (err) {
  99. console.log(`抓取地级(${cCode})的县级数据失败(${err}),正在重试...`)
  100. console.log(path)
  101. return this.fetchAreas(cCode)
  102. }
  103. }
  104. /**
  105. * 抓取乡级数据
  106. * @author modood <https://github.com/modood>
  107. * @datetime 2018-01-31 20:08
  108. */
  109. exports.fetchStreets = async (aCode) => {
  110. aCode = aCode.toString()
  111. const pCode = aCode.substr(0, 2)
  112. const cCodeSuffix = aCode.substr(2, 2)
  113. const path = links[3].path
  114. .replace('#{pCode}', pCode)
  115. .replace('#{cCodeSuffix}', cCodeSuffix)
  116. .replace('#{aCode}', aCode)
  117. try {
  118. return await exports.fetch(host, path, links[3].regexp, 9)
  119. } catch (err) {
  120. console.log(`抓取县级(${aCode})的乡级数据失败(${err}),正在重试...`)
  121. console.log(path)
  122. return this.fetchStreets(aCode)
  123. }
  124. }
  125. /**
  126. * 抓取村级数据
  127. * @author modood <https://github.com/modood>
  128. * @datetime 2018-01-31 20:19
  129. */
  130. exports.fetchVillages = async (sCode) => {
  131. sCode = sCode.toString()
  132. const pCode = sCode.substr(0, 2)
  133. const cCodeSuffix = sCode.substr(2, 2)
  134. const aCodeSuffix = sCode.substr(4, 2)
  135. const path = links[4].path
  136. .replace('#{pCode}', pCode)
  137. .replace('#{cCodeSuffix}', cCodeSuffix)
  138. .replace('#{aCodeSuffix}', aCodeSuffix)
  139. .replace('#{sCode}', sCode)
  140. try {
  141. return await exports.fetch(host, path, links[4].regexp, 12)
  142. } catch (err) {
  143. console.log(`抓取乡级(${sCode})的村级数据失败(${err}),正在重试...`)
  144. return this.fetchVillages(sCode)
  145. }
  146. }