worker.js 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226
  1. const crawler = require('./crawler')
  2. const { Province, City, Area, Street, Village } = require('./sqlite')
  // Page size used when walking a parent table: the rows crawled for up to
  // 100 parent records are collected and then written with one bulk insert.
  const limit = 100
  5. /**
  6. * 抓取所有省级数据
  7. * @author https://github.com/modood
  8. * @datetime 2018-01-31 22:11
  9. */
  10. exports.fetchProvinces = async () => {
  11. console.log('[1/1]正在抓取省级数据...')
  12. const o = await crawler.fetchProvinces()
  13. const rows = []
  14. for (const code in o) {
  15. const name = o[code]
  16. rows.push({ code, name })
  17. }
  18. await Province.bulkCreate(rows, { ignoreDuplicates: true })
  19. }
  20. /**
  21. * 抓取所有地级数据
  22. * @author https://github.com/modood
  23. * @datetime 2018-01-31 22:13
  24. */
  25. exports.fetchCities = async () => {
  26. await exports.fetchProvinces()
  27. const count = await Province.count()
  28. let index = 0
  29. let hasNext = true
  30. let after
  31. while (hasNext) {
  32. const r = await Province.paginate({ limit, after })
  33. const rows = []
  34. for (let i = 0; i < r.results.length; i++) {
  35. const { dataValues: {
  36. name: provinceName,
  37. code: provinceCode } } = r.results[i]
  38. index++
  39. console.log(`[${index}/${count}]正在抓取地级数据,当前省级:${provinceCode} ${provinceName}`)
  40. const o = await crawler.fetchCities(provinceCode)
  41. for (const code in o) {
  42. const name = o[code]
  43. rows.push({ code, name, provinceCode })
  44. }
  45. }
  46. await City.bulkCreate(rows, { ignoreDuplicates: true })
  47. hasNext = r.cursors.hasNext
  48. after = r.cursors.after
  49. }
  50. }
  51. /**
  52. * 获取所有县级数据
  53. * @author https://github.com/modood
  54. * @datetime 2018-02-01 09:12
  55. */
  56. exports.fetchAreas = async () => {
  57. await exports.fetchCities()
  58. const count = await City.count()
  59. let index = 0
  60. let hasNext = true
  61. let after
  62. while (hasNext) {
  63. const r = await City.paginate({ limit, after })
  64. const rows = []
  65. for (let i = 0; i < r.results.length; i++) {
  66. const { dataValues: {
  67. name: cityName,
  68. code: cityCode,
  69. provinceCode } } = r.results[i]
  70. index++
  71. console.log(`[${index}/${count}]正在抓取县级数据,当前地级:${cityCode} ${cityName}`)
  72. // 特殊处理:广东省中山市(3320)、广东省东莞市(4419)、海南省儋州市(4604)没有县级。
  73. if (['4420', '4419', '4604'].includes(cityCode)) continue
  74. const o = await crawler.fetchAreas(cityCode)
  75. for (const code in o) {
  76. const name = o[code]
  77. rows.push({ code, name, cityCode, provinceCode })
  78. }
  79. }
  80. await Area.bulkCreate(rows, { ignoreDuplicates: true })
  81. hasNext = r.cursors.hasNext
  82. after = r.cursors.after
  83. }
  84. // 特殊处理:广东省中山市(3320)、广东省东莞市(4419)、海南省儋州市(4604)没有县级,
  85. // 需要手动插入。
  86. await Area.bulkCreate([
  87. { code: '441900', name: '东莞市', cityCode: '4419', provinceCode: '44' },
  88. { code: '442000', name: '中山市', cityCode: '4420', provinceCode: '44' },
  89. { code: '460400', name: '儋州市', cityCode: '4604', provinceCode: '46' }
  90. ], { ignoreDuplicates: true })
  91. // 特殊处理:甘肃省嘉峪关市下仅一个县级名为市辖区(code: 620201),重命名。
  92. await Area.update({ name: '嘉峪关市' }, { where: { code: '620201' } })
  93. }
  94. /**
  95. * 获取所有乡级数据
  96. * @author https://github.com/modood
  97. * @datetime 2018-02-01 09:28
  98. */
  99. exports.fetchStreets = async () => {
  100. await exports.fetchAreas()
  101. const count = await Area.count()
  102. let index = 0
  103. let hasNext = true
  104. let after
  105. while (hasNext) {
  106. const r = await Area.paginate({ limit, after })
  107. const rows = []
  108. for (let i = 0; i < r.results.length; i++) {
  109. const { dataValues: {
  110. name: areaName,
  111. code: areaCode,
  112. cityCode,
  113. provinceCode } } = r.results[i]
  114. index++
  115. console.log(`[${index}/${count}]正在抓取乡级数据,当前县级:${areaCode} ${areaName}`)
  116. // 特殊处理:名为市辖区的县级没有乡级
  117. // 1. 福建省泉州市金门县(350527)也没有乡级
  118. // 2. 甘肃省嘉峪关市下一个县级名为市辖区(code: 620201),
  119. // 海南省三亚市下一个县级名为市辖区(code: 460201),
  120. // 但是它们有乡级,因此不可略过。
  121. if ((areaName === '市辖区' && !['620201', '460201'].includes(areaCode)) ||
  122. ['350527'].includes(areaCode)) continue
  123. // 特殊处理:广东省中山市(3320)、广东省东莞市(4419)、海南省儋州市(4604)的乡级
  124. // 页面的路由比较特别,需要手动拼接。
  125. let route
  126. if (['4420', '4419', '4604'].includes(cityCode)) route = `${provinceCode}/${cityCode}`
  127. const o = await crawler.fetchStreets(areaCode, route)
  128. for (const code in o) {
  129. const name = o[code]
  130. rows.push({ code, name, areaCode, cityCode, provinceCode })
  131. }
  132. }
  133. await Street.bulkCreate(rows, { ignoreDuplicates: true })
  134. hasNext = r.cursors.hasNext
  135. after = r.cursors.after
  136. }
  137. }
  138. /**
  139. * 抓取所有村级数据
  140. * @author https://github.com/modood
  141. * @datetime 2018-02-01 09:47
  142. */
  143. exports.fetchVillages = async () => {
  144. await exports.fetchStreets()
  145. const count = await Street.count()
  146. let index = 0
  147. let hasNext = true
  148. let after
  149. while (hasNext) {
  150. const r = await Street.paginate({ limit, after })
  151. const rows = []
  152. for (let i = 0; i < r.results.length; i++) {
  153. const { dataValues: {
  154. name: streetName,
  155. code: streetCode,
  156. areaCode,
  157. cityCode,
  158. provinceCode } } = r.results[i]
  159. index++
  160. if (['350527'].includes(areaCode)) {
  161. console.log(`[${index}/${count}]跳过例外村级数据,当前乡级:${streetCode} ${streetName}`)
  162. continue
  163. }
  164. console.log(`[${index}/${count}]正在抓取村级数据,当前乡级:${streetCode} ${streetName}`)
  165. // 特殊处理:广东省中山市(3320)、广东省东莞市(4419)、海南省儋州市(4604)的村级
  166. // 页面的路由比较特别,需要手动拼接。
  167. let route
  168. const cCodeSuffix = cityCode.substr(2, 2)
  169. if (['4420', '4419', '4604'].includes(cityCode)) route = `${provinceCode}/${cCodeSuffix}/${streetCode}`
  170. const o = await crawler.fetchVillages(streetCode, route)
  171. for (const code in o) {
  172. const name = o[code]
  173. rows.push({ code, name, streetCode, areaCode, cityCode, provinceCode })
  174. }
  175. }
  176. await Village.bulkCreate(rows, { ignoreDuplicates: true })
  177. hasNext = r.cursors.hasNext
  178. after = r.cursors.after
  179. }
  180. }
  181. /**
  182. * 补漏
  183. * @author https://github.com/modood
  184. * @datetime 2018-02-02 13:39
  185. */
  186. exports.patch = async () => {
  187. // 特殊处理:福建省泉州市金门县(350527)没有乡级导致没有匹配上爬取县级的正则表达式。
  188. // 手动插入县级、乡级、村级
  189. const areas = [
  190. { code: '350527', name: '金门县', cityCode: '3505', provinceCode: '35' }
  191. ]
  192. const streets = [
  193. { code: '350527000', name: '金门县', areaCode: '350527', cityCode: '3505', provinceCode: '35' }
  194. ]
  195. const villages = [
  196. { code: '350527000000', name: '金门县', streetCode: '350527000', areaCode: '350527', cityCode: '3505', provinceCode: '35' }
  197. ]
  198. await Area.bulkCreate(areas, { ignoreDuplicates: true })
  199. await Street.bulkCreate(streets, { ignoreDuplicates: true })
  200. await Village.bulkCreate(villages, { ignoreDuplicates: true })
  201. }