123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226 |
- const crawler = require('./crawler')
- const { Province, City, Area, Street, Village } = require('./sqlite')
- // 每抓取 100 个页面再批量写入数据库
- const limit = 100
- /**
- * 抓取所有省级数据
- * @author https://github.com/modood
- * @datetime 2018-01-31 22:11
- */
- exports.fetchProvinces = async () => {
- console.log('[1/1]正在抓取省级数据...')
- const o = await crawler.fetchProvinces()
- const rows = []
- for (const code in o) {
- const name = o[code]
- rows.push({ code, name })
- }
- await Province.bulkCreate(rows, { ignoreDuplicates: true })
- }
- /**
- * 抓取所有地级数据
- * @author https://github.com/modood
- * @datetime 2018-01-31 22:13
- */
- exports.fetchCities = async () => {
- await exports.fetchProvinces()
- const count = await Province.count()
- let index = 0
- let hasNext = true
- let after
- while (hasNext) {
- const r = await Province.paginate({ limit, after })
- const rows = []
- for (let i = 0; i < r.results.length; i++) {
- const { dataValues: {
- name: provinceName,
- code: provinceCode } } = r.results[i]
- index++
- console.log(`[${index}/${count}]正在抓取地级数据,当前省级:${provinceCode} ${provinceName}`)
- const o = await crawler.fetchCities(provinceCode)
- for (const code in o) {
- const name = o[code]
- rows.push({ code, name, provinceCode })
- }
- }
- await City.bulkCreate(rows, { ignoreDuplicates: true })
- hasNext = r.cursors.hasNext
- after = r.cursors.after
- }
- }
- /**
- * 获取所有县级数据
- * @author https://github.com/modood
- * @datetime 2018-02-01 09:12
- */
- exports.fetchAreas = async () => {
- await exports.fetchCities()
- const count = await City.count()
- let index = 0
- let hasNext = true
- let after
- while (hasNext) {
- const r = await City.paginate({ limit, after })
- const rows = []
- for (let i = 0; i < r.results.length; i++) {
- const { dataValues: {
- name: cityName,
- code: cityCode,
- provinceCode } } = r.results[i]
- index++
- console.log(`[${index}/${count}]正在抓取县级数据,当前地级:${cityCode} ${cityName}`)
- // 特殊处理:广东省中山市(3320)、广东省东莞市(4419)、海南省儋州市(4604)没有县级。
- if (['4420', '4419', '4604'].includes(cityCode)) continue
- const o = await crawler.fetchAreas(cityCode)
- for (const code in o) {
- const name = o[code]
- rows.push({ code, name, cityCode, provinceCode })
- }
- }
- await Area.bulkCreate(rows, { ignoreDuplicates: true })
- hasNext = r.cursors.hasNext
- after = r.cursors.after
- }
- // 特殊处理:广东省中山市(3320)、广东省东莞市(4419)、海南省儋州市(4604)没有县级,
- // 需要手动插入。
- await Area.bulkCreate([
- { code: '441900', name: '东莞市', cityCode: '4419', provinceCode: '44' },
- { code: '442000', name: '中山市', cityCode: '4420', provinceCode: '44' },
- { code: '460400', name: '儋州市', cityCode: '4604', provinceCode: '46' }
- ], { ignoreDuplicates: true })
- // 特殊处理:甘肃省嘉峪关市下仅一个县级名为市辖区(code: 620201),重命名。
- await Area.update({ name: '嘉峪关市' }, { where: { code: '620201' } })
- }
- /**
- * 获取所有乡级数据
- * @author https://github.com/modood
- * @datetime 2018-02-01 09:28
- */
- exports.fetchStreets = async () => {
- await exports.fetchAreas()
- const count = await Area.count()
- let index = 0
- let hasNext = true
- let after
- while (hasNext) {
- const r = await Area.paginate({ limit, after })
- const rows = []
- for (let i = 0; i < r.results.length; i++) {
- const { dataValues: {
- name: areaName,
- code: areaCode,
- cityCode,
- provinceCode } } = r.results[i]
- index++
- console.log(`[${index}/${count}]正在抓取乡级数据,当前县级:${areaCode} ${areaName}`)
- // 特殊处理:名为市辖区的县级没有乡级
- // 1. 福建省泉州市金门县(350527)也没有乡级
- // 2. 甘肃省嘉峪关市下一个县级名为市辖区(code: 620201),
- // 海南省三亚市下一个县级名为市辖区(code: 460201),
- // 但是它们有乡级,因此不可略过。
- if ((areaName === '市辖区' && !['620201', '460201'].includes(areaCode)) ||
- ['350527'].includes(areaCode)) continue
- // 特殊处理:广东省中山市(3320)、广东省东莞市(4419)、海南省儋州市(4604)的乡级
- // 页面的路由比较特别,需要手动拼接。
- let route
- if (['4420', '4419', '4604'].includes(cityCode)) route = `${provinceCode}/${cityCode}`
- const o = await crawler.fetchStreets(areaCode, route)
- for (const code in o) {
- const name = o[code]
- rows.push({ code, name, areaCode, cityCode, provinceCode })
- }
- }
- await Street.bulkCreate(rows, { ignoreDuplicates: true })
- hasNext = r.cursors.hasNext
- after = r.cursors.after
- }
- }
- /**
- * 抓取所有村级数据
- * @author https://github.com/modood
- * @datetime 2018-02-01 09:47
- */
- exports.fetchVillages = async () => {
- await exports.fetchStreets()
- const count = await Street.count()
- let index = 0
- let hasNext = true
- let after
- while (hasNext) {
- const r = await Street.paginate({ limit, after })
- const rows = []
- for (let i = 0; i < r.results.length; i++) {
- const { dataValues: {
- name: streetName,
- code: streetCode,
- areaCode,
- cityCode,
- provinceCode } } = r.results[i]
- index++
- if (['350527'].includes(areaCode)) {
- console.log(`[${index}/${count}]跳过例外村级数据,当前乡级:${streetCode} ${streetName}`)
- continue
- }
- console.log(`[${index}/${count}]正在抓取村级数据,当前乡级:${streetCode} ${streetName}`)
- // 特殊处理:广东省中山市(3320)、广东省东莞市(4419)、海南省儋州市(4604)的村级
- // 页面的路由比较特别,需要手动拼接。
- let route
- const cCodeSuffix = cityCode.substr(2, 2)
- if (['4420', '4419', '4604'].includes(cityCode)) route = `${provinceCode}/${cCodeSuffix}/${streetCode}`
- const o = await crawler.fetchVillages(streetCode, route)
- for (const code in o) {
- const name = o[code]
- rows.push({ code, name, streetCode, areaCode, cityCode, provinceCode })
- }
- }
- await Village.bulkCreate(rows, { ignoreDuplicates: true })
- hasNext = r.cursors.hasNext
- after = r.cursors.after
- }
- }
- /**
- * 补漏
- * @author https://github.com/modood
- * @datetime 2018-02-02 13:39
- */
- exports.patch = async () => {
- // 特殊处理:福建省泉州市金门县(350527)没有乡级导致没有匹配上爬取县级的正则表达式。
- // 手动插入县级、乡级、村级
- const areas = [
- { code: '350527', name: '金门县', cityCode: '3505', provinceCode: '35' }
- ]
- const streets = [
- { code: '350527000', name: '金门县', areaCode: '350527', cityCode: '3505', provinceCode: '35' }
- ]
- const villages = [
- { code: '350527000000', name: '金门县', streetCode: '350527000', areaCode: '350527', cityCode: '3505', provinceCode: '35' }
- ]
- await Area.bulkCreate(areas, { ignoreDuplicates: true })
- await Street.bulkCreate(streets, { ignoreDuplicates: true })
- await Village.bulkCreate(villages, { ignoreDuplicates: true })
- }
|