Bläddra i källkod

refactor: 对 HTML 进行正则匹配前先压缩,防止 ^M 之类的特殊空白字符造成匹配失败

Fixes #34
modood 6 år sedan
förälder
incheckning
134cd79307
2 ändrade filer med 4 tillägg och 2 borttagningar
  1. lib/crawler.js (+3 -2)
  2. package.json (+1 -0)

+ 3 - 2
lib/crawler.js

@@ -1,6 +1,7 @@
 const http = require('http')
 
 const iconv = require('iconv-lite')
+const minify = require('html-minifier').minify
 const BufferHelper = require('bufferhelper')
 
 /*
@@ -15,7 +16,7 @@ const BufferHelper = require('bufferhelper')
 
 const pReg = /<td><a href='(.*?).html'>(.*?)<br\/><\/a><\/td>/g
 const casReg = /<tr class='.*?'><td><a href=.*?>(.*?)<\/a><\/td><td><a href=.*?>(.*?)<\/a><\/td><\/tr>/g
-const vReg = /<td>(.*?)<\/td><td>.*?<\/td><td>(.*?)<\/td>/g
+const vReg = /<tr class='.*?'><td>(.*?)<\/td><td>.*?<\/td><td>(.*?)<\/td><\/tr>/g
 
 const host = 'www.stats.gov.cn'
 const path = '/tjsj/tjbz/tjyqhdmhcxhfdm/2017/#{route}.html'
@@ -42,7 +43,7 @@ exports.fetch = (host, route, regexp, codeLen) =>
     res.on('data', chunk => bufferHelper.concat(chunk))
 
     res.on('end', () => {
-      const rawData = iconv.decode(bufferHelper.toBuffer(), 'GBK')
+      const rawData = minify(iconv.decode(bufferHelper.toBuffer(), 'GBK'), { collapseWhitespace: true, quoteCharacter: '\'' })
 
       const result = {}
       let current

+ 1 - 0
package.json

@@ -44,6 +44,7 @@
     "eslint-plugin-node": "^5.1.1",
     "eslint-plugin-promise": "^3.5.0",
     "eslint-plugin-standard": "^3.0.1",
+    "html-minifier": "^3.5.20",
     "husky": "^0.13.4",
     "iconv-lite": "^0.4.15",
     "lodash": "^4.17.4",