worker.js 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260
  1. const crawler = require('./crawler')
  2. const Sequelize = require('sequelize')
  3. const { Province, City, Area, Street, Village } = require('./sqlite')
  4. // Batch size: after every 100 crawled pages the collected rows are written to the database in bulk
  5. const limit = 100
  6. /**
  7. * 抓取所有省级数据
  8. * @author https://github.com/modood
  9. * @datetime 2018-01-31 22:11
  10. */
  11. exports.fetchProvinces = async () => {
  12. const count = await Province.count()
  13. if (count !== 0) {
  14. return
  15. }
  16. console.log('[1/1]正在抓取省级数据...')
  17. const o = await crawler.fetchProvinces()
  18. const rows = []
  19. for (const code in o) {
  20. const name = o[code]
  21. rows.push({ code, name })
  22. }
  23. await Province.bulkCreate(rows, { ignoreDuplicates: true })
  24. }
  25. /**
  26. * 抓取所有地级数据
  27. * @author https://github.com/modood
  28. * @datetime 2018-01-31 22:13
  29. */
  30. exports.fetchCities = async () => {
  31. await exports.fetchProvinces()
  32. const fetchedProvinceCode = await City.aggregate('provinceCode', 'DISTINCT', { plain: false }).map(o => o.DISTINCT)
  33. const where = { code: { [Sequelize.Op.notIn]: fetchedProvinceCode } }
  34. const count = await Province.count({ where })
  35. if (count === 0) {
  36. return
  37. }
  38. let index = 0
  39. let hasNext = true
  40. let after
  41. while (hasNext) {
  42. const r = await Province.paginate({ where, limit, after })
  43. const rows = []
  44. for (let i = 0; i < r.results.length; i++) {
  45. const { dataValues: {
  46. name: provinceName,
  47. code: provinceCode } } = r.results[i]
  48. index++
  49. console.log(`[${index}/${count}]正在抓取地级数据,当前省级:${provinceCode} ${provinceName}`)
  50. const o = await crawler.fetchCities(provinceCode)
  51. for (const code in o) {
  52. const name = o[code]
  53. rows.push({ code, name, provinceCode })
  54. }
  55. }
  56. await City.bulkCreate(rows, { ignoreDuplicates: true })
  57. hasNext = r.cursors.hasNext
  58. after = r.cursors.after
  59. }
  60. }
  61. /**
  62. * 获取所有县级数据
  63. * @author https://github.com/modood
  64. * @datetime 2018-02-01 09:12
  65. */
  66. exports.fetchAreas = async () => {
  67. await exports.fetchCities()
  68. const fetchedCityCode = await Area.aggregate('cityCode', 'DISTINCT', { plain: false }).map(o => o.DISTINCT)
  69. const where = { code: { [Sequelize.Op.notIn]: fetchedCityCode } }
  70. const count = await City.count({ where })
  71. if (count === 0) {
  72. return
  73. }
  74. let index = 0
  75. let hasNext = true
  76. let after
  77. while (hasNext) {
  78. const r = await City.paginate({ where, limit, after })
  79. const rows = []
  80. for (let i = 0; i < r.results.length; i++) {
  81. const { dataValues: {
  82. name: cityName,
  83. code: cityCode,
  84. provinceCode } } = r.results[i]
  85. index++
  86. console.log(`[${index}/${count}]正在抓取县级数据,当前地级:${cityCode} ${cityName}`)
  87. // 特殊处理:广东省中山市(3320)、广东省东莞市(4419)、海南省儋州市(4604)没有县级。
  88. if (['4420', '4419', '4604'].includes(cityCode)) continue
  89. const o = await crawler.fetchAreas(cityCode)
  90. for (const code in o) {
  91. const name = o[code]
  92. rows.push({ code, name, cityCode, provinceCode })
  93. }
  94. }
  95. await Area.bulkCreate(rows, { ignoreDuplicates: true })
  96. hasNext = r.cursors.hasNext
  97. after = r.cursors.after
  98. }
  99. // 特殊处理:广东省中山市(3320)、广东省东莞市(4419)、海南省儋州市(4604)没有县级,
  100. // 需要手动插入。
  101. await Area.bulkCreate([
  102. { code: '441900', name: '东莞市', cityCode: '4419', provinceCode: '44' },
  103. { code: '442000', name: '中山市', cityCode: '4420', provinceCode: '44' },
  104. { code: '460400', name: '儋州市', cityCode: '4604', provinceCode: '46' }
  105. ], { ignoreDuplicates: true })
  106. // 特殊处理:甘肃省嘉峪关市下仅一个县级名为市辖区(code: 620201),重命名。
  107. await Area.update({ name: '嘉峪关市' }, { where: { code: '620201' } })
  108. }
  109. /**
  110. * 获取所有乡级数据
  111. * @author https://github.com/modood
  112. * @datetime 2018-02-01 09:28
  113. */
  114. exports.fetchStreets = async () => {
  115. await exports.fetchAreas()
  116. const fetchedAreaCode = await Street.aggregate('areaCode', 'DISTINCT', { plain: false }).map(o => o.DISTINCT)
  117. const where = { code: { [Sequelize.Op.notIn]: fetchedAreaCode } }
  118. const count = await Area.count({ where })
  119. if (count === 0) {
  120. return
  121. }
  122. let index = 0
  123. let hasNext = true
  124. let after
  125. while (hasNext) {
  126. const r = await Area.paginate({ where, limit, after })
  127. const rows = []
  128. for (let i = 0; i < r.results.length; i++) {
  129. const { dataValues: {
  130. name: areaName,
  131. code: areaCode,
  132. cityCode,
  133. provinceCode } } = r.results[i]
  134. index++
  135. console.log(`[${index}/${count}]正在抓取乡级数据,当前县级:${areaCode} ${areaName}`)
  136. // 特殊处理:名为市辖区的县级没有乡级
  137. // 1. 福建省泉州市金门县(350527)也没有乡级
  138. // 2. 甘肃省嘉峪关市下一个县级名为市辖区(code: 620201),
  139. // 海南省三亚市下一个县级名为市辖区(code: 460201),
  140. // 但是它们有乡级,因此不可略过。
  141. if ((areaName === '市辖区' && !['620201', '460201'].includes(areaCode)) ||
  142. ['350527'].includes(areaCode)) continue
  143. // 特殊处理:广东省中山市(3320)、广东省东莞市(4419)、海南省儋州市(4604)的乡级
  144. // 页面的路由比较特别,需要手动拼接。
  145. let route
  146. if (['4420', '4419', '4604'].includes(cityCode)) route = `${provinceCode}/${cityCode}`
  147. const o = await crawler.fetchStreets(areaCode, route)
  148. for (const code in o) {
  149. const name = o[code]
  150. rows.push({ code, name, areaCode, cityCode, provinceCode })
  151. }
  152. }
  153. await Street.bulkCreate(rows, { ignoreDuplicates: true })
  154. hasNext = r.cursors.hasNext
  155. after = r.cursors.after
  156. }
  157. }
  158. /**
  159. * 抓取所有村级数据
  160. * @author https://github.com/modood
  161. * @datetime 2018-02-01 09:47
  162. */
  163. exports.fetchVillages = async () => {
  164. await exports.fetchStreets()
  165. const fetchedStreetCode = await Village.aggregate('streetCode', 'DISTINCT', { plain: false }).map(o => o.DISTINCT)
  166. const where = { code: { [Sequelize.Op.notIn]: fetchedStreetCode } }
  167. const count = await Street.count({ where })
  168. if (count === 0) {
  169. return
  170. }
  171. let index = 0
  172. let hasNext = true
  173. let after
  174. while (hasNext) {
  175. const r = await Street.paginate({ where, limit, after })
  176. const rows = []
  177. for (let i = 0; i < r.results.length; i++) {
  178. const { dataValues: {
  179. name: streetName,
  180. code: streetCode,
  181. areaCode,
  182. cityCode,
  183. provinceCode } } = r.results[i]
  184. index++
  185. if (['350527'].includes(areaCode)) {
  186. console.log(`[${index}/${count}]跳过例外村级数据,当前乡级:${streetCode} ${streetName}`)
  187. continue
  188. }
  189. console.log(`[${index}/${count}]正在抓取村级数据,当前乡级:${streetCode} ${streetName}`)
  190. // 特殊处理:广东省中山市(3320)、广东省东莞市(4419)、海南省儋州市(4604)的村级
  191. // 页面的路由比较特别,需要手动拼接。
  192. let route
  193. const cCodeSuffix = cityCode.substr(2, 2)
  194. if (['4420', '4419', '4604'].includes(cityCode)) route = `${provinceCode}/${cCodeSuffix}/${streetCode}`
  195. const o = await crawler.fetchVillages(streetCode, route)
  196. for (const code in o) {
  197. const name = o[code]
  198. rows.push({ code, name, streetCode, areaCode, cityCode, provinceCode })
  199. }
  200. }
  201. await Village.bulkCreate(rows, { ignoreDuplicates: true })
  202. hasNext = r.cursors.hasNext
  203. after = r.cursors.after
  204. }
  205. }
  206. /**
  207. * 补漏
  208. * @author https://github.com/modood
  209. * @datetime 2018-02-02 13:39
  210. */
  211. exports.patch = async () => {
  212. // 特殊处理:福建省泉州市金门县(350527)没有乡级导致没有匹配上爬取县级的正则表达式。
  213. // 手动插入县级、乡级、村级
  214. const areas = [
  215. { code: '350527', name: '金门县', cityCode: '3505', provinceCode: '35' }
  216. ]
  217. const streets = [
  218. { code: '350527000', name: '金门县', areaCode: '350527', cityCode: '3505', provinceCode: '35' }
  219. ]
  220. const villages = [
  221. { code: '350527000000', name: '金门县', streetCode: '350527000', areaCode: '350527', cityCode: '3505', provinceCode: '35' }
  222. ]
  223. await Area.bulkCreate(areas, { ignoreDuplicates: true })
  224. await Street.bulkCreate(streets, { ignoreDuplicates: true })
  225. await Village.bulkCreate(villages, { ignoreDuplicates: true })
  226. }