crawler.js 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161
  1. const http = require('http')
  2. const iconv = require('iconv-lite')
  3. const BufferHelper = require('bufferhelper')
  /*
   * Note: abbreviations used in identifier names
   * Province              p
   * City                  c
   * Area (district/county) a
   * Street (town/sub-district) s
   * Village (residents'/villagers' committee) v
   */
  const host = 'www.stats.gov.cn'

  // Province index page: capture 1 is the href without ".html", capture 2 the link text.
  const provinceLink = {
    path: '/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html',
    regexp: /<td><a href='(.*?).html'>(.*?)<br\/><\/a><\/td>/g
  }

  // Page of one province: each row holds two anchors; both anchor texts are captured.
  const cityLink = {
    path: '/tjsj/tjbz/tjyqhdmhcxhfdm/2016/#{pCode}.html',
    regexp: /<tr class='.*?'><td><a href=.*?>(.*?)<\/a><\/td><td><a href=.*?>(.*?)<\/a><\/td><\/tr>/g
  }

  // Page of one city, same row layout as the province page.
  const areaLink = {
    path: '/tjsj/tjbz/tjyqhdmhcxhfdm/2016/#{pCode}/#{cCode}.html',
    regexp: /<tr class='.*?'><td><a href=.*?>(.*?)<\/a><\/td><td><a href=.*?>(.*?)<\/a><\/td><\/tr>/g
  }

  // Page of one area, same row layout as the province page.
  const streetLink = {
    path: '/tjsj/tjbz/tjyqhdmhcxhfdm/2016/#{pCode}/#{cCodeSuffix}/#{aCode}.html',
    regexp: /<tr class='.*?'><td><a href=.*?>(.*?)<\/a><\/td><td><a href=.*?>(.*?)<\/a><\/td><\/tr>/g
  }

  // Page of one street: three plain cells per row; the first and third are captured.
  const villageLink = {
    path: '/tjsj/tjbz/tjyqhdmhcxhfdm/2016/#{pCode}/#{cCodeSuffix}/#{aCodeSuffix}/#{sCode}.html',
    regexp: /<td>(.*?)<\/td><td>.*?<\/td><td>(.*?)<\/td>/g
  }

  // Indexed by level: 0 provinces, 1 cities, 2 areas, 3 streets, 4 villages.
  // Every entry keeps its own RegExp object, as "#{...}" placeholders are
  // substituted by the fetch* functions before the request is made.
  const links = [provinceLink, cityLink, areaLink, streetLink, villageLink]
  /**
   * Fetch a page and extract key/value pairs from it.
   *
   * Sends an HTTP GET for `path` on `host`, decodes the whole response body
   * as GBK and applies `regexp` repeatedly to the decoded text. For every
   * match, capture group 1 becomes a key of the result object and the
   * trimmed capture group 2 becomes its value (later matches with the same
   * key overwrite earlier ones).
   *
   * @author modood <https://github.com/modood>
   * @datetime 2018-01-31 19:23
   * @param {string} host - Host name to request.
   * @param {string} path - Request path on that host.
   * @param {RegExp} regexp - Pattern with (at least) two capture groups.
   * @returns {Promise<Object>} Resolves with the extracted pairs. Rejects on
   *   a non-200 status code, on a request/response error, or with
   *   `Error('timeout')` when the 1000 ms socket timeout fires.
   */
  exports.fetch = (host, path, regexp) =>
    new Promise((resolve, reject) => {
      const req = http.get({ host, path, timeout: 1000 }, res => {
        const statusCode = res.statusCode
        if (statusCode !== 200) {
          // Drain the body so the socket is released.
          res.resume()
          return reject(new Error('Request Failed. Status Code: ' + statusCode))
        }

        const bufferHelper = new BufferHelper()
        // A response that is torn down mid-stream would otherwise raise an
        // unhandled 'error' event.
        res.on('error', reject)
        res.on('data', chunk => bufferHelper.concat(chunk))
        res.on('end', () => {
          const rawData = iconv.decode(bufferHelper.toBuffer(), 'GBK')

          // Scan with a private global copy: the loop below needs the "g"
          // flag to advance (without it exec() returns the first match
          // forever), and a copy neither depends on nor disturbs the
          // caller's lastIndex.
          const flags = regexp.global ? regexp.flags : regexp.flags + 'g'
          const pattern = new RegExp(regexp.source, flags)

          const result = {}
          let current
          while ((current = pattern.exec(rawData)) !== null) {
            // A zero-length match does not move lastIndex; step over it.
            if (current[0] === '') pattern.lastIndex++
            result[current[1]] = current[2].trim()
          }
          return resolve(result)
        })
      })

      req.on('error', reject)
      req.on('timeout', () => {
        // The 'timeout' event only reports socket inactivity; the request
        // has to be destroyed explicitly or the connection stays open.
        reject(new Error('timeout'))
        req.destroy()
      })
    })
  57. /**
  58. * 抓取省份数据
  59. * @author modood <https://github.com/modood>
  60. * @datetime 2018-01-31 19:40
  61. */
  62. exports.fetchProvinces = async () => {
  63. try {
  64. return await exports.fetch(host, links[0].path, links[0].regexp)
  65. } catch (err) {
  66. console.error(err)
  67. process.exit(-1)
  68. }
  69. }
  70. /**
  71. * 抓取城市数据
  72. * @author modood <https://github.com/modood>
  73. * @datetime 2018-01-31 19:51
  74. */
  75. exports.fetchCities = async (pCode) => {
  76. const path = links[1].path.replace('#{pCode}', pCode)
  77. try {
  78. return await exports.fetch(host, path, links[1].regexp)
  79. } catch (err) {
  80. console.log(`抓取省份(${pCode})的城市数据失败(${err}),正在重试...`)
  81. return this.fetchCities(pCode)
  82. }
  83. }
  84. /**
  85. * 抓取区县数据
  86. * @author modood <https://github.com/modood>
  87. * @datetime 2018-01-31 20:03
  88. */
  89. exports.fetchAreas = async (cCode) => {
  90. cCode = cCode.toString()
  91. const pCode = cCode.substr(0, 2)
  92. const path = links[2].path
  93. .replace('#{pCode}', pCode)
  94. .replace('#{cCode}', cCode)
  95. try {
  96. return await exports.fetch(host, path, links[2].regexp)
  97. } catch (err) {
  98. console.log(`抓取城市(${cCode})的区县数据失败(${err}),正在重试...`)
  99. return this.fetchAreas(cCode)
  100. }
  101. }
  102. /**
  103. * 抓取街道数据
  104. * @author modood <https://github.com/modood>
  105. * @datetime 2018-01-31 20:08
  106. */
  107. exports.fetchStreets = async (aCode) => {
  108. aCode = aCode.toString()
  109. const pCode = aCode.substr(0, 2)
  110. const cCodeSuffix = aCode.substr(2, 2)
  111. const path = links[3].path
  112. .replace('#{pCode}', pCode)
  113. .replace('#{cCodeSuffix}', cCodeSuffix)
  114. .replace('#{aCode}', aCode)
  115. try {
  116. return await exports.fetch(host, path, links[3].regexp)
  117. } catch (err) {
  118. console.log(`抓取区县(${aCode})的乡镇街道数据失败(${err}),正在重试...`)
  119. return this.fetchStreets(aCode)
  120. }
  121. }
  122. /**
  123. * 抓取村(居)委会数据
  124. * @author modood <https://github.com/modood>
  125. * @datetime 2018-01-31 20:19
  126. */
  127. exports.fetchVillages = async (sCode) => {
  128. sCode = sCode.toString()
  129. const pCode = sCode.substr(0, 2)
  130. const cCodeSuffix = sCode.substr(2, 2)
  131. const aCodeSuffix = sCode.substr(4, 2)
  132. const path = links[4].path
  133. .replace('#{pCode}', pCode)
  134. .replace('#{cCodeSuffix}', cCodeSuffix)
  135. .replace('#{aCodeSuffix}', aCodeSuffix)
  136. .replace('#{sCode}', sCode)
  137. try {
  138. return await exports.fetch(host, path, links[4].regexp)
  139. } catch (err) {
  140. console.log(`抓取乡镇街道(${sCode})的村(居)委会数据失败(${err}),正在重试...`)
  141. return this.fetchVillages(sCode)
  142. }
  143. }