|
@@ -1,4 +1,5 @@
|
|
|
const http = require('http')
|
|
|
+const zlib = require('zlib')
|
|
|
|
|
|
const iconv = require('iconv-lite')
|
|
|
const minify = require('html-minifier').minify
|
|
@@ -19,7 +20,7 @@ const casReg = /<tr class='.*?'><td><a href=.*?>(.*?)<\/a><\/td><td><a href=.*?>
|
|
|
const vReg = /<tr class='.*?'><td>(.*?)<\/td><td>.*?<\/td><td>(.*?)<\/td><\/tr>/g
|
|
|
|
|
|
const host = 'www.stats.gov.cn'
|
|
|
-const path = '/tjsj/tjbz/tjyqhdmhcxhfdm/2021/#{route}.html'
|
|
|
+const path = '/tjsj/tjbz/tjyqhdmhcxhfdm/2022/#{route}.html'
|
|
|
|
|
|
/**
|
|
|
* 抓取数据
|
|
@@ -50,13 +51,16 @@ exports.fetch = (host, route, regexp, codeLen) =>
|
|
|
res.on('data', chunk => bufferHelper.concat(chunk))
|
|
|
|
|
|
res.on('end', () => {
|
|
|
- const rawData = minify(iconv.decode(bufferHelper.toBuffer(), 'UTF-8'), { collapseWhitespace: true, quoteCharacter: '\'' })
|
|
|
+ let raw = iconv.decode(bufferHelper.toBuffer(), 'UTF-8')
|
|
|
+ if (!raw.includes('国家统计局')) {
|
|
|
+ raw = iconv.decode(zlib.gunzipSync(bufferHelper.toBuffer()), 'UTF-8')
|
|
|
+ }
|
|
|
+ const rawData = minify(raw, { collapseWhitespace: true, quoteCharacter: '\'' })
|
|
|
|
|
|
const result = {}
|
|
|
let current
|
|
|
while ((current = regexp.exec(rawData)) !== null) result[current[1].substr(0, codeLen)] = current[2].trim()
|
|
|
if (Object.keys(result).length === 0) {
|
|
|
- const raw = iconv.decode(bufferHelper.toBuffer(), 'UTF-8')
|
|
|
if (raw.includes('请开启JavaScript并刷新该页')) {
|
|
|
console.log('\n温馨提示:请求过于频繁已被目标网站限制,当前抓取进度已保存,请五分钟后再试...\n')
|
|
|
process.exit(0)
|