瀏覽代碼

refactor: 确保数据抓取无遗漏

modood 5 年之前
父節點
當前提交
5369f94831
共有 3 個文件被更改,包括 32 次插入和 7 次删除
  1. + 3 - 0
      lib/crawler.js
  2. + 28 - 6
      lib/worker.js
  3. + 1 - 1
      package.json

+ 3 - 0
lib/crawler.js

@@ -48,6 +48,9 @@ exports.fetch = (host, route, regexp, codeLen) =>
       const result = {}
       let current
       while ((current = regexp.exec(rawData)) !== null) result[current[1].substr(0, codeLen)] = current[2].trim()
+      if (Object.keys(result).length === 0) {
+        return reject(new Error('Request Failed. rawData: '), rawData)
+      }
 
       return resolve(result)
     })

+ 28 - 6
lib/worker.js

@@ -1,4 +1,5 @@
 const crawler = require('./crawler')
+const Sequelize = require('sequelize')
 const { Province, City, Area, Street, Village } = require('./sqlite')
 
 // 每抓取 100 个页面再批量写入数据库
@@ -62,12 +63,19 @@ exports.fetchCities = async () => {
 exports.fetchAreas = async () => {
   await exports.fetchCities()
 
-  const count = await City.count()
+  const fetchedCityCode = await Area.aggregate('cityCode', 'DISTINCT', { plain: false }).map(o => o.DISTINCT)
+  const where = { code: { [Sequelize.Op.notIn]: fetchedCityCode } }
+  const count = await City.count({ where })
+
+  if (count === 0) {
+    return
+  }
+
   let index = 0
   let hasNext = true
   let after
   while (hasNext) {
-    const r = await City.paginate({ limit, after })
+    const r = await City.paginate({ where, limit, after })
     const rows = []
     for (let i = 0; i < r.results.length; i++) {
       const { dataValues: {
@@ -112,12 +120,19 @@ exports.fetchAreas = async () => {
 exports.fetchStreets = async () => {
   await exports.fetchAreas()
 
-  const count = await Area.count()
+  const fetchedAreaCode = await Street.aggregate('areaCode', 'DISTINCT', { plain: false }).map(o => o.DISTINCT)
+  const where = { code: { [Sequelize.Op.notIn]: fetchedAreaCode } }
+  const count = await Area.count({ where })
+
+  if (count === 0) {
+    return
+  }
+
   let index = 0
   let hasNext = true
   let after
   while (hasNext) {
-    const r = await Area.paginate({ limit, after })
+    const r = await Area.paginate({ where, limit, after })
     const rows = []
     for (let i = 0; i < r.results.length; i++) {
       const { dataValues: {
@@ -162,12 +177,19 @@ exports.fetchStreets = async () => {
 exports.fetchVillages = async () => {
   await exports.fetchStreets()
 
-  const count = await Street.count()
+  const fetchedStreetCode = await Village.aggregate('streetCode', 'DISTINCT', { plain: false }).map(o => o.DISTINCT)
+  const where = { code: { [Sequelize.Op.notIn]: fetchedStreetCode } }
+  const count = await Street.count({ where })
+
+  if (count === 0) {
+    return
+  }
+
   let index = 0
   let hasNext = true
   let after
   while (hasNext) {
-    const r = await Street.paginate({ limit, after })
+    const r = await Street.paginate({ where, limit, after })
     const rows = []
     for (let i = 0; i < r.results.length; i++) {
       const { dataValues: {

+ 1 - 1
package.json

@@ -1,6 +1,6 @@
 {
   "name": "china-division",
-  "version": "2.3.0",
+  "version": "2.3.1",
   "description": "中华人民共和国行政区划:省份、城市、区县、乡镇(街道)、村(居)委会",
   "main": "lib/export.js",
   "scripts": {