crawler.js 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156
  1. const http = require('http')
  2. const zlib = require('zlib')
  3. const iconv = require('iconv-lite')
  4. const minify = require('html-minifier').minify
  5. const BufferHelper = require('bufferhelper')
  /*
   * Abbreviations used in identifiers throughout this module
   *
   * Province level (Province)                            p
   * Prefecture / city level (City)                       c
   * County / district level (Area)                       a
   * Township / sub-district level (Street)               s
   * Village level (village and residents' committees)    v
   */

  // Host name used by the level-specific fetchers below.
  const host = 'www.stats.gov.cn'

  // Request path template; `#{route}` is substituted with the page route before each request.
  const path = '/sj/tjbz/tjyqhdmhcxhfdm/2022/#{route}.html'

  // Province index cell: group 1 is the link target without ".html", group 2 is the link text.
  const pReg = /<td><a href='(.*?).html'>(.*?)<br><\/a><\/td>/g

  // City / area / street table row made of two linked cells: group 1 is the text of
  // the first link, group 2 is the text of the second link.
  const casReg = /<tr class='.*?'><td><a href=.*?>(.*?)<\/a><\/td><td><a href=.*?>(.*?)<\/a><\/td><\/tr>/g

  // Village table row made of three plain cells: group 1 is the first cell,
  // the middle cell is skipped, group 2 is the third cell.
  const vReg = /<tr class='.*?'><td>(.*?)<\/td><td>.*?<\/td><td>(.*?)<\/td><\/tr>/g
  /**
   * Fetch one listing page and extract its code/name pairs.
   *
   * Sends an HTTP GET to `host` for the module-level `path` template with
   * `#{route}` replaced by `route`, decodes the body as UTF-8 (gunzipping it
   * first when necessary), minifies the HTML and scans it with `regexp`.
   *
   * @author modood <https://github.com/modood>
   * @datetime 2018-01-31 19:23
   * @param {string} host - Host name to request.
   * @param {string} route - Value substituted for `#{route}` in the path template.
   * @param {RegExp} regexp - Global regexp; capture group 1 is the code, group 2 the name.
   * @param {number} codeLen - Number of leading characters of group 1 kept as the key.
   * @returns {Promise<Object>} Resolves with a map of truncated code to trimmed name.
   *   Rejects with `Error('timeout')` on a socket timeout or a 302 response, with
   *   `Error('Request Failed. Status Code: N')` on any other non-200 status, and
   *   with the underlying error on network or decoding failures.
   */
  exports.fetch = (host, route, regexp, codeLen) =>
    new Promise((resolve, reject) => {
      const req = http.get({
        host,
        path: path.replace('#{route}', route),
        timeout: 3000
      }, res => {
        const statusCode = res.statusCode

        // 302 Moved Temporarily: a plain retry normally succeeds, so it is
        // reported as a timeout and handled by the callers' retry logic.
        if (statusCode === 302) {
          res.resume()
          return reject(new Error('timeout'))
        }

        if (statusCode !== 200) {
          res.resume()
          return reject(new Error('Request Failed. Status Code: ' + statusCode))
        }

        const bufferHelper = new BufferHelper()

        // A connection dropped mid-body must reject instead of going unhandled.
        res.on('error', reject)
        res.on('data', chunk => bufferHelper.concat(chunk))
        res.on('end', () => {
          // Everything below runs inside an event callback, where a throw would
          // bypass the promise; convert any failure into a rejection instead.
          try {
            const body = bufferHelper.toBuffer()
            let raw = iconv.decode(body, 'UTF-8')

            // The body is sometimes gzip-compressed. Only gunzip when the expected
            // marker text is missing AND the body starts with the gzip magic bytes,
            // so an uncompressed page without the marker no longer throws.
            const looksGzipped = body.length >= 2 && body[0] === 0x1f && body[1] === 0x8b
            if (!raw.includes('国家统计局') && looksGzipped) {
              raw = iconv.decode(zlib.gunzipSync(body), 'UTF-8')
            }

            const rawData = minify(raw, { collapseWhitespace: true, quoteCharacter: '\'' })

            const result = {}
            let current
            // The regexp is a shared global (/g) object; always scan from the start.
            regexp.lastIndex = 0
            while ((current = regexp.exec(rawData)) !== null) {
              result[current[1].substring(0, codeLen)] = current[2].trim()
            }

            // No matches together with the "enable JavaScript" notice means the
            // site has rate-limited us: tell the user and stop the whole process.
            if (Object.keys(result).length === 0) {
              if (raw.includes('请开启JavaScript并刷新该页')) {
                console.log('\n温馨提示:请求过于频繁已被目标网站限制,当前抓取进度已保存,请五分钟后再试...\n')
                process.exit(0)
              }
            }

            return resolve(result)
          } catch (err) {
            return reject(err)
          }
        })
      })

      req.on('error', reject)
      req.on('timeout', () => {
        // The 'timeout' event only signals socket inactivity; the request has to
        // be destroyed explicitly or the socket stays open.
        reject(new Error('timeout'))
        req.destroy()
      })
    })
  62. /**
  63. * 抓取省级数据
  64. * @author modood <https://github.com/modood>
  65. * @datetime 2018-01-31 19:40
  66. */
  67. exports.fetchProvinces = async () => {
  68. try {
  69. return await exports.fetch(host, 'index', pReg, 2)
  70. } catch (err) {
  71. if (err.message !== 'timeout') console.log(`抓取省级数据失败(${err}),正在重试...`)
  72. return exports.fetchProvinces()
  73. }
  74. }
  75. /**
  76. * 抓取地级数据
  77. * @author modood <https://github.com/modood>
  78. * @datetime 2018-01-31 19:51
  79. */
  80. exports.fetchCities = async (pCode) => {
  81. try {
  82. return await exports.fetch(host, pCode, casReg, 4)
  83. } catch (err) {
  84. if (err.message !== 'timeout') console.log(`抓取省级(${pCode})的地级数据失败(${err}),正在重试...`)
  85. return exports.fetchCities(pCode)
  86. }
  87. }
  88. /**
  89. * 抓取县级数据
  90. * @author modood <https://github.com/modood>
  91. * @datetime 2018-01-31 20:03
  92. */
  93. exports.fetchAreas = async (cCode) => {
  94. cCode = cCode.toString()
  95. const pCode = cCode.substr(0, 2)
  96. try {
  97. return await exports.fetch(host, `${pCode}/${cCode}`, casReg, 6)
  98. } catch (err) {
  99. if (err.message !== 'timeout') console.log(`抓取地级(${cCode})的县级数据失败(${err}),正在重试...`)
  100. return exports.fetchAreas(cCode)
  101. }
  102. }
  103. /**
  104. * 抓取乡级数据
  105. * @author modood <https://github.com/modood>
  106. * @datetime 2018-01-31 20:08
  107. */
  108. exports.fetchStreets = async (aCode, route) => {
  109. aCode = aCode.toString()
  110. const pCode = aCode.substr(0, 2)
  111. const cCodeSuffix = aCode.substr(2, 2)
  112. const _route = route || `${pCode}/${cCodeSuffix}/${aCode}`
  113. try {
  114. return await exports.fetch(host, _route, casReg, 9)
  115. } catch (err) {
  116. if (err.message !== 'timeout') console.log(`抓取县级(${aCode})的乡级数据失败(${err}),正在重试...`)
  117. return exports.fetchStreets(aCode, route)
  118. }
  119. }
  120. /**
  121. * 抓取村级数据
  122. * @author modood <https://github.com/modood>
  123. * @datetime 2018-01-31 20:19
  124. */
  125. exports.fetchVillages = async (sCode, route) => {
  126. sCode = sCode.toString()
  127. const pCode = sCode.substr(0, 2)
  128. const cCodeSuffix = sCode.substr(2, 2)
  129. const aCodeSuffix = sCode.substr(4, 2)
  130. const _route = route || `${pCode}/${cCodeSuffix}/${aCodeSuffix}/${sCode}`
  131. try {
  132. return await exports.fetch(host, _route, vReg, 12)
  133. } catch (err) {
  134. if (err.message !== 'timeout') console.log(`抓取乡级(${sCode})的村级数据失败(${err}),正在重试...`)
  135. return exports.fetchVillages(sCode, route)
  136. }
  137. }