crawler.js 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
  1. const http = require('http')
  2. const iconv = require('iconv-lite')
  3. const minify = require('html-minifier').minify
  4. const BufferHelper = require('bufferhelper')
/*
 * Naming abbreviations used in this file
 *
 * Province level (省份, Province): p
 * Prefecture level (城市, City): c
 * County level (区县, Area): a
 * Township level (乡镇街道, Street): s
 * Village level (村委会居委会, Village): v
 */
  14. const pReg = /<td><a href='(.*?).html'>(.*?)<br><\/a><\/td>/g
  15. const casReg = /<tr class='.*?'><td><a href=.*?>(.*?)<\/a><\/td><td><a href=.*?>(.*?)<\/a><\/td><\/tr>/g
  16. const vReg = /<tr class='.*?'><td>(.*?)<\/td><td>.*?<\/td><td>(.*?)<\/td><\/tr>/g
  17. const host = 'www.stats.gov.cn'
  18. const path = '/tjsj/tjbz/tjyqhdmhcxhfdm/2019/#{route}.html'
  19. /**
  20. * 抓取数据
  21. * @author modood <https://github.com/modood>
  22. * @datetime 2018-01-31 19:23
  23. */
  24. exports.fetch = (host, route, regexp, codeLen) =>
  25. new Promise((resolve, reject) => http.get({
  26. host,
  27. path: path.replace('#{route}', route),
  28. timeout: 3000
  29. }, res => {
  30. const bufferHelper = new BufferHelper()
  31. const statusCode = res.statusCode
  32. if (statusCode !== 200) {
  33. res.resume()
  34. return reject(new Error('Request Failed. Status Code: ' + statusCode))
  35. }
  36. res.on('data', chunk => bufferHelper.concat(chunk))
  37. res.on('end', () => {
  38. const rawData = minify(iconv.decode(bufferHelper.toBuffer(), 'GBK'), { collapseWhitespace: true, quoteCharacter: '\'' })
  39. const result = {}
  40. let current
  41. while ((current = regexp.exec(rawData)) !== null) result[current[1].substr(0, codeLen)] = current[2].trim()
  42. if (Object.keys(result).length === 0) {
  43. return reject(new Error('Request Failed. rawData: '), rawData)
  44. }
  45. return resolve(result)
  46. })
  47. }).on('error', reject).on('timeout', () => reject(new Error('timeout'))))
  48. /**
  49. * 抓取省级数据
  50. * @author modood <https://github.com/modood>
  51. * @datetime 2018-01-31 19:40
  52. */
  53. exports.fetchProvinces = async () => {
  54. try {
  55. return await exports.fetch(host, 'index', pReg, 2)
  56. } catch (err) {
  57. if (err.message !== 'timeout') console.log(`抓取省级数据失败(${err}),正在重试...`)
  58. return exports.fetchProvinces()
  59. }
  60. }
  61. /**
  62. * 抓取地级数据
  63. * @author modood <https://github.com/modood>
  64. * @datetime 2018-01-31 19:51
  65. */
  66. exports.fetchCities = async (pCode) => {
  67. try {
  68. return await exports.fetch(host, pCode, casReg, 4)
  69. } catch (err) {
  70. if (err.message !== 'timeout') console.log(`抓取省级(${pCode})的地级数据失败(${err}),正在重试...`)
  71. return exports.fetchCities(pCode)
  72. }
  73. }
  74. /**
  75. * 抓取县级数据
  76. * @author modood <https://github.com/modood>
  77. * @datetime 2018-01-31 20:03
  78. */
  79. exports.fetchAreas = async (cCode) => {
  80. cCode = cCode.toString()
  81. const pCode = cCode.substr(0, 2)
  82. try {
  83. return await exports.fetch(host, `${pCode}/${cCode}`, casReg, 6)
  84. } catch (err) {
  85. if (err.message !== 'timeout') console.log(`抓取地级(${cCode})的县级数据失败(${err}),正在重试...`)
  86. return exports.fetchAreas(cCode)
  87. }
  88. }
  89. /**
  90. * 抓取乡级数据
  91. * @author modood <https://github.com/modood>
  92. * @datetime 2018-01-31 20:08
  93. */
  94. exports.fetchStreets = async (aCode, route) => {
  95. aCode = aCode.toString()
  96. const pCode = aCode.substr(0, 2)
  97. const cCodeSuffix = aCode.substr(2, 2)
  98. const _route = route || `${pCode}/${cCodeSuffix}/${aCode}`
  99. try {
  100. return await exports.fetch(host, _route, casReg, 9)
  101. } catch (err) {
  102. if (err.message !== 'timeout') console.log(`抓取县级(${aCode})的乡级数据失败(${err}),正在重试...`)
  103. return exports.fetchStreets(aCode, route)
  104. }
  105. }
  106. /**
  107. * 抓取村级数据
  108. * @author modood <https://github.com/modood>
  109. * @datetime 2018-01-31 20:19
  110. */
  111. exports.fetchVillages = async (sCode, route) => {
  112. sCode = sCode.toString()
  113. const pCode = sCode.substr(0, 2)
  114. const cCodeSuffix = sCode.substr(2, 2)
  115. const aCodeSuffix = sCode.substr(4, 2)
  116. const _route = route || `${pCode}/${cCodeSuffix}/${aCodeSuffix}/${sCode}`
  117. try {
  118. return await exports.fetch(host, _route, vReg, 12)
  119. } catch (err) {
  120. if (err.message !== 'timeout') console.log(`抓取乡级(${sCode})的村级数据失败(${err}),正在重试...`)
  121. return exports.fetchVillages(sCode, route)
  122. }
  123. }