worker.js 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248
  1. const crawler = require('./crawler')
  2. const Sequelize = require('sequelize')
  3. const { Province, City, Area, Street, Village } = require('./sqlite')
  4. // Page size: parent records are read 100 at a time, and the rows crawled for each page are bulk-inserted in one batch
  5. const limit = 100
  6. /**
  7. * 抓取所有省级数据
  8. * @author https://github.com/modood
  9. * @datetime 2018-01-31 22:11
  10. */
  11. exports.fetchProvinces = async () => {
  12. console.log('[1/1]正在抓取省级数据...')
  13. const o = await crawler.fetchProvinces()
  14. const rows = []
  15. for (const code in o) {
  16. const name = o[code]
  17. rows.push({ code, name })
  18. }
  19. await Province.bulkCreate(rows, { ignoreDuplicates: true })
  20. }
  21. /**
  22. * 抓取所有地级数据
  23. * @author https://github.com/modood
  24. * @datetime 2018-01-31 22:13
  25. */
  26. exports.fetchCities = async () => {
  27. await exports.fetchProvinces()
  28. const count = await Province.count()
  29. let index = 0
  30. let hasNext = true
  31. let after
  32. while (hasNext) {
  33. const r = await Province.paginate({ limit, after })
  34. const rows = []
  35. for (let i = 0; i < r.results.length; i++) {
  36. const { dataValues: {
  37. name: provinceName,
  38. code: provinceCode } } = r.results[i]
  39. index++
  40. console.log(`[${index}/${count}]正在抓取地级数据,当前省级:${provinceCode} ${provinceName}`)
  41. const o = await crawler.fetchCities(provinceCode)
  42. for (const code in o) {
  43. const name = o[code]
  44. rows.push({ code, name, provinceCode })
  45. }
  46. }
  47. await City.bulkCreate(rows, { ignoreDuplicates: true })
  48. hasNext = r.cursors.hasNext
  49. after = r.cursors.after
  50. }
  51. }
  52. /**
  53. * 获取所有县级数据
  54. * @author https://github.com/modood
  55. * @datetime 2018-02-01 09:12
  56. */
  57. exports.fetchAreas = async () => {
  58. await exports.fetchCities()
  59. const fetchedCityCode = await Area.aggregate('cityCode', 'DISTINCT', { plain: false }).map(o => o.DISTINCT)
  60. const where = { code: { [Sequelize.Op.notIn]: fetchedCityCode } }
  61. const count = await City.count({ where })
  62. if (count === 0) {
  63. return
  64. }
  65. let index = 0
  66. let hasNext = true
  67. let after
  68. while (hasNext) {
  69. const r = await City.paginate({ where, limit, after })
  70. const rows = []
  71. for (let i = 0; i < r.results.length; i++) {
  72. const { dataValues: {
  73. name: cityName,
  74. code: cityCode,
  75. provinceCode } } = r.results[i]
  76. index++
  77. console.log(`[${index}/${count}]正在抓取县级数据,当前地级:${cityCode} ${cityName}`)
  78. // 特殊处理:广东省中山市(3320)、广东省东莞市(4419)、海南省儋州市(4604)没有县级。
  79. if (['4420', '4419', '4604'].includes(cityCode)) continue
  80. const o = await crawler.fetchAreas(cityCode)
  81. for (const code in o) {
  82. const name = o[code]
  83. rows.push({ code, name, cityCode, provinceCode })
  84. }
  85. }
  86. await Area.bulkCreate(rows, { ignoreDuplicates: true })
  87. hasNext = r.cursors.hasNext
  88. after = r.cursors.after
  89. }
  90. // 特殊处理:广东省中山市(3320)、广东省东莞市(4419)、海南省儋州市(4604)没有县级,
  91. // 需要手动插入。
  92. await Area.bulkCreate([
  93. { code: '441900', name: '东莞市', cityCode: '4419', provinceCode: '44' },
  94. { code: '442000', name: '中山市', cityCode: '4420', provinceCode: '44' },
  95. { code: '460400', name: '儋州市', cityCode: '4604', provinceCode: '46' }
  96. ], { ignoreDuplicates: true })
  97. // 特殊处理:甘肃省嘉峪关市下仅一个县级名为市辖区(code: 620201),重命名。
  98. await Area.update({ name: '嘉峪关市' }, { where: { code: '620201' } })
  99. }
  100. /**
  101. * 获取所有乡级数据
  102. * @author https://github.com/modood
  103. * @datetime 2018-02-01 09:28
  104. */
  105. exports.fetchStreets = async () => {
  106. await exports.fetchAreas()
  107. const fetchedAreaCode = await Street.aggregate('areaCode', 'DISTINCT', { plain: false }).map(o => o.DISTINCT)
  108. const where = { code: { [Sequelize.Op.notIn]: fetchedAreaCode } }
  109. const count = await Area.count({ where })
  110. if (count === 0) {
  111. return
  112. }
  113. let index = 0
  114. let hasNext = true
  115. let after
  116. while (hasNext) {
  117. const r = await Area.paginate({ where, limit, after })
  118. const rows = []
  119. for (let i = 0; i < r.results.length; i++) {
  120. const { dataValues: {
  121. name: areaName,
  122. code: areaCode,
  123. cityCode,
  124. provinceCode } } = r.results[i]
  125. index++
  126. console.log(`[${index}/${count}]正在抓取乡级数据,当前县级:${areaCode} ${areaName}`)
  127. // 特殊处理:名为市辖区的县级没有乡级
  128. // 1. 福建省泉州市金门县(350527)也没有乡级
  129. // 2. 甘肃省嘉峪关市下一个县级名为市辖区(code: 620201),
  130. // 海南省三亚市下一个县级名为市辖区(code: 460201),
  131. // 但是它们有乡级,因此不可略过。
  132. if ((areaName === '市辖区' && !['620201', '460201'].includes(areaCode)) ||
  133. ['350527'].includes(areaCode)) continue
  134. // 特殊处理:广东省中山市(3320)、广东省东莞市(4419)、海南省儋州市(4604)的乡级
  135. // 页面的路由比较特别,需要手动拼接。
  136. let route
  137. if (['4420', '4419', '4604'].includes(cityCode)) route = `${provinceCode}/${cityCode}`
  138. const o = await crawler.fetchStreets(areaCode, route)
  139. for (const code in o) {
  140. const name = o[code]
  141. rows.push({ code, name, areaCode, cityCode, provinceCode })
  142. }
  143. }
  144. await Street.bulkCreate(rows, { ignoreDuplicates: true })
  145. hasNext = r.cursors.hasNext
  146. after = r.cursors.after
  147. }
  148. }
  149. /**
  150. * 抓取所有村级数据
  151. * @author https://github.com/modood
  152. * @datetime 2018-02-01 09:47
  153. */
  154. exports.fetchVillages = async () => {
  155. await exports.fetchStreets()
  156. const fetchedStreetCode = await Village.aggregate('streetCode', 'DISTINCT', { plain: false }).map(o => o.DISTINCT)
  157. const where = { code: { [Sequelize.Op.notIn]: fetchedStreetCode } }
  158. const count = await Street.count({ where })
  159. if (count === 0) {
  160. return
  161. }
  162. let index = 0
  163. let hasNext = true
  164. let after
  165. while (hasNext) {
  166. const r = await Street.paginate({ where, limit, after })
  167. const rows = []
  168. for (let i = 0; i < r.results.length; i++) {
  169. const { dataValues: {
  170. name: streetName,
  171. code: streetCode,
  172. areaCode,
  173. cityCode,
  174. provinceCode } } = r.results[i]
  175. index++
  176. if (['350527'].includes(areaCode)) {
  177. console.log(`[${index}/${count}]跳过例外村级数据,当前乡级:${streetCode} ${streetName}`)
  178. continue
  179. }
  180. console.log(`[${index}/${count}]正在抓取村级数据,当前乡级:${streetCode} ${streetName}`)
  181. // 特殊处理:广东省中山市(3320)、广东省东莞市(4419)、海南省儋州市(4604)的村级
  182. // 页面的路由比较特别,需要手动拼接。
  183. let route
  184. const cCodeSuffix = cityCode.substr(2, 2)
  185. if (['4420', '4419', '4604'].includes(cityCode)) route = `${provinceCode}/${cCodeSuffix}/${streetCode}`
  186. const o = await crawler.fetchVillages(streetCode, route)
  187. for (const code in o) {
  188. const name = o[code]
  189. rows.push({ code, name, streetCode, areaCode, cityCode, provinceCode })
  190. }
  191. }
  192. await Village.bulkCreate(rows, { ignoreDuplicates: true })
  193. hasNext = r.cursors.hasNext
  194. after = r.cursors.after
  195. }
  196. }
  197. /**
  198. * 补漏
  199. * @author https://github.com/modood
  200. * @datetime 2018-02-02 13:39
  201. */
  202. exports.patch = async () => {
  203. // 特殊处理:福建省泉州市金门县(350527)没有乡级导致没有匹配上爬取县级的正则表达式。
  204. // 手动插入县级、乡级、村级
  205. const areas = [
  206. { code: '350527', name: '金门县', cityCode: '3505', provinceCode: '35' }
  207. ]
  208. const streets = [
  209. { code: '350527000', name: '金门县', areaCode: '350527', cityCode: '3505', provinceCode: '35' }
  210. ]
  211. const villages = [
  212. { code: '350527000000', name: '金门县', streetCode: '350527000', areaCode: '350527', cityCode: '3505', provinceCode: '35' }
  213. ]
  214. await Area.bulkCreate(areas, { ignoreDuplicates: true })
  215. await Street.bulkCreate(streets, { ignoreDuplicates: true })
  216. await Village.bulkCreate(villages, { ignoreDuplicates: true })
  217. }