Эх сурвалжийг харах

refactor: 添加更多抓取异常情况处理

modood 5 жил өмнө
parent
commit
66f42aed1c
2 өөрчлөгдсөн 26 нэмэгдсэн, 3 устгасан
  1. 12 1
      lib/crawler.js
  2. 14 2
      lib/worker.js

+ 12 - 1
lib/crawler.js

@@ -35,6 +35,13 @@ exports.fetch = (host, route, regexp, codeLen) =>
     const bufferHelper = new BufferHelper()
     const statusCode = res.statusCode
 
+    // 302 Move Temporarily
+    // 这种情况一般重试就可以了,所以视为超时统一重试处理
+    if (statusCode === 302) {
+      res.resume()
+      return reject(new Error('timeout'))
+    }
+
     if (statusCode !== 200) {
       res.resume()
       return reject(new Error('Request Failed. Status Code: ' + statusCode))
@@ -49,7 +56,11 @@ exports.fetch = (host, route, regexp, codeLen) =>
       let current
       while ((current = regexp.exec(rawData)) !== null) result[current[1].substr(0, codeLen)] = current[2].trim()
       if (Object.keys(result).length === 0) {
-        return reject(new Error('Request Failed. rawData: '), rawData)
+        const raw = iconv.decode(bufferHelper.toBuffer(), 'UTF-8')
+        if (raw.includes('请开启JavaScript并刷新该页')) {
+          console.log('\n温馨提示:请求过于频繁已被目标网站限制,当前抓取进度已保存,请五分钟后再试...\n')
+          process.exit(0)
+        }
       }
 
       return resolve(result)

+ 14 - 2
lib/worker.js

@@ -11,6 +11,11 @@ const limit = 100
  * @datetime 2018-01-31 22:11
  */
 exports.fetchProvinces = async () => {
+  const count = await Province.count()
+  if (count !== 0) {
+    return
+  }
+
   console.log('[1/1]正在抓取省级数据...')
   const o = await crawler.fetchProvinces()
   const rows = []
@@ -29,12 +34,19 @@ exports.fetchProvinces = async () => {
 exports.fetchCities = async () => {
   await exports.fetchProvinces()
 
-  const count = await Province.count()
+  const fetchedProvinceCode = await City.aggregate('provinceCode', 'DISTINCT', { plain: false }).map(o => o.DISTINCT)
+  const where = { code: { [Sequelize.Op.notIn]: fetchedProvinceCode } }
+  const count = await Province.count({ where })
+
+  if (count === 0) {
+    return
+  }
+
   let index = 0
   let hasNext = true
   let after
   while (hasNext) {
-    const r = await Province.paginate({ limit, after })
+    const r = await Province.paginate({ where, limit, after })
     const rows = []
     for (let i = 0; i < r.results.length; i++) {
       const { dataValues: {