/**
 * Main script: recursively downloads the HTML pages below TARGET_URL and
 * stores each one on disk under a directory tree that mirrors its URL
 * (host name first, then the URL path).
 */

// Load libraries
var client = require('cheerio-httpcli');
var request = require('request');
var URL = require('url');
var fs = require('fs');
var path = require('path');

// Links are followed at most this many levels deep (3 for this run)
var LINK_LEVEL = 3;
// Root URL of the site section to download
var TARGET_URL = "http://docs.opencv.org/3.0.0/";
// URLs already handed to downloadRec (url -> true), used to avoid re-fetching
var list = {};

// Kick off the download from the root page
downloadRec(TARGET_URL, 0);

/*
 * Function definitions
 */

/**
 * Fetches one page, recursively follows every <a href> it contains, and
 * writes the page's HTML to disk.
 *
 * NOTE(review): every link is fetched immediately, with no limit on the
 * number of requests in flight. This may be related to the ETIMEDOUT errors
 * seen when crawling a large site -- unconfirmed.
 *
 * @param {string} url   Absolute URL of the page to download.
 * @param {number} level Link depth of this page (0 for the root page).
 */
function downloadRec(url, level) {
  // Stop once the maximum link depth has been reached
  if (level >= LINK_LEVEL) {
    return;
  }

  // Ignore everything outside TARGET_URL. The base is TARGET_URL without its
  // trailing "/", and it must be a *prefix* of the URL: a plain substring
  // test would also accept external URLs that merely contain the base
  // (for example inside a query string).
  var us = TARGET_URL.split("/");
  us.pop();
  var base = us.join("/");
  if (url.indexOf(base) !== 0) {
    return;
  }

  // Skip URLs that have already been requested
  if (list[url]) {
    return;
  }
  list[url] = true;

  client.fetch(url, {}, function (err, $, res) {
    // On failure (timeout, HTTP error status, ...) there is no usable `$`,
    // so log the error and stop here instead of crashing on `$("a")`.
    if (err) {
      console.log("Error", err);
      console.log("RESPONCE", res);
      return;
    }

    // Check every <a> tag on the page
    $("a").each(function () {
      var href = $(this).attr("href");
      // Nothing to follow without a link target
      if (!href) {
        return;
      }
      // Turn relative paths into absolute URLs
      href = URL.resolve(url, href);
      // Drop the fragment, including a bare trailing "#", so that
      // "page.html" and "page.html#" are treated as the same page
      href = href.replace(/#.*$/, "");
      // Recurse into the linked page
      downloadRec(href, level + 1);
    });

    // A URL ending in "/" is saved as "index.html" inside that directory
    var pageUrl = url;
    if (pageUrl.charAt(pageUrl.length - 1) === "/") {
      pageUrl += "index.html";
    }

    // Drop the protocol part ("http:" and the empty string after it) so the
    // local path becomes "<host>/<url path>"
    var savepath = pageUrl.split("/").slice(2).join("/");

    // A single page that cannot be saved must not abort the whole crawl
    try {
      checkSaveDir(savepath);
      console.log(savepath);
      fs.writeFileSync(savepath, $.html());
    } catch (saveErr) {
      console.log("Error", saveErr);
    }
  });
}

/**
 * Makes sure the directory that will contain `fname` exists, creating each
 * missing path component in turn.
 *
 * @param {string} fname "/"-separated path of the file about to be written.
 */
function checkSaveDir(fname) {
  var dirlist = path.dirname(fname).split("/");
  var p = "";
  for (var i = 0; i < dirlist.length; i++) {
    p += dirlist[i] + "/";
    if (!fs.existsSync(p)) {
      fs.mkdirSync(p);
    }
  }
}

// --- Article text that followed this listing ---
// The first version of this script had no `return` in the error branch of the
// fetch callback. The run went smoothly at first and then stopped with the
// message shown next, so error handling was added to see what was going on.
getall.js:37 $("a").each(function(idx){ ^ TypeError: $ is not a function at Object.callback (/root/WebCrawler-NetAgent/ch02/getall.js:37:3) at Object.module.exports.fail (/root/node_modules/cheerio-httpcli/lib/client.js:59:15) at Object.<anonymous> (/root/node_modules/cheerio-httpcli/lib/client.js:191:21) at Object.<anonymous> (/root/node_modules/cheerio-httpcli/lib/client.js:129:16) at self.callback (/root/node_modules/cheerio-httpcli/node_modules/request/request.js:198:22) at emitOne (events.js:77:13) at Request.emit (events.js:169:7) at null._onTimeout (/root/node_modules/cheerio-httpcli/node_modules/request/request.js:811:12) at Timer.listOnTimeout (timers.js:92:15)client.fetchの一番上にエラー処理を入れてみた。
: : client.fetch(url, {}, function(err, $, res){ if(err){ console.log("Error", err); console.log("RESPONCE", res); return; } $("a").each(function(idx){ var href = $(this).attr("href"); if(!href){ return; } : :すると、約300ページほどタイムアウトで取得できていないことがわかった。ただURLを直接ブラウザで確認すると問題なさそうなので、なんでかなと思いつつ一旦スルー。ちなみにこんなエラー。
Error { [Error: ETIMEDOUT] code: 'ETIMEDOUT', connect: true, url: 'http://docs.opencv.org/3.0.0/dd/de2/classcv_1_1AutoLock.html', param: {} } RESPONCE undefined Error { [Error: ETIMEDOUT] code: 'ETIMEDOUT', connect: true, url: 'http://docs.opencv.org/3.0.0/d7/d7b/classcv_1_1BackgroundSubtractorMOG2.html', param: {} } RESPONCE undefined Error { [Error: ETIMEDOUT] code: 'ETIMEDOUT', connect: true, url: 'http://docs.opencv.org/3.0.0/d0/d2e/classcv_1_1CommandLineParser.html', param: {} }こんなのが300個ほど。(正確には332個だった) 取得したのを確認してみるとこんな感じ。 あと404も1つあった。これは実際のサイトのURLをたどっても404だったのでプログラム的には問題なし。
Error { [Error: server status] url: 'http://docs.opencv.org/3.0.0//3.0-last-rst', param: {}, statusCode: 404 } RESPONCE IncomingMessage { _readableState: ReadableState { objectMode: false, highWaterMark: 16384, buffer: [], length: 0, pipes: null, pipesCount: 0, flowing: true, ended: true, endEmitted: true, reading: false, sync: false, needReadable: false, emittedReadable: false, readableListening: false, defaultEncoding: 'utf8', ranOut: false, awaitDrain: 0, readingMore: false, decoder: null, encoding: null, resumeScheduled: false }, readable: false, domain: null, _events: { end: [ [Function: responseOnEnd], [Function], [Function], [Function] ], close: [ [Function], [Function] ], data: [Function], error: [Function] }, _eventsCount: 4, _maxListeners: undefined, socket: Socket { _connecting: false, _hadError: false, _handle: null, _parent: null, _host: 'docs.opencv.org', _readableState: ReadableState { objectMode: false, highWaterMark: 16384, buffer: [], length: 0, pipes: null, pipesCount: 0, flowing: true, ended: true, endEmitted: true, reading: false, sync: false, needReadable: false, emittedReadable: false, readableListening: false, defaultEncoding: 'utf8', ranOut: false, awaitDrain: 0, readingMore: false, decoder: null, encoding: null, resumeScheduled: false }, readable: false, domain: null, _events: { end: [Object], finish: [Function: onSocketFinish], _socketEnd: [Function: onSocketEnd], free: [Function: onFree], close: [Object], agentRemove: [Function: onRemove], drain: [Function: ondrain], error: [Object], data: [Function: socketOnData], timeout: [Object] }, _eventsCount: 10, _maxListeners: 0, _writableState: WritableState { objectMode: false, highWaterMark: 16384, needDrain: false, ending: true, ended: true, finished: true, decodeStrings: false, defaultEncoding: 'utf8', length: 0, writing: false, corked: 0, sync: false, bufferProcessing: false, onwrite: [Function], writecb: null, writelen: 0, bufferedRequest: null, lastBufferedRequest: null, pendingcb: 0, 
prefinished: true, errorEmitted: false }, writable: false, allowHalfOpen: false, destroyed: true, bytesRead: 383, _bytesDispatched: 292, _sockname: null, _pendingData: null, _pendingEncoding: '', parser: null, _httpMessage: ClientRequest { domain: null, _events: [Object], _eventsCount: 5, _maxListeners: undefined, output: [], outputEncodings: [], outputCallbacks: [], outputSize: 0, writable: true, _last: true, chunkedEncoding: false, shouldKeepAlive: false, useChunkedEncodingByDefault: false, sendDate: false, _removedHeader: {}, _contentLength: 0, _hasBody: true, _trailer: '', finished: true, _headerSent: true, socket: [Circular], connection: [Circular], _header: 'GET /3.0.0//3.0-last-rst HTTP/1.1\r\nHost: docs.opencv.org\r\nUser-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36\r\nReferer: http://docs.opencv.org/3.0.0/d9/df8/tutorial_root.html\r\nAccept-Encoding: gzip, deflate\r\nConnection: close\r\n\r\n', _headers: [Object], _headerNames: [Object], _onPendingData: null, agent: [Object], socketPath: undefined, method: 'GET', path: '/3.0.0//3.0-last-rst', timeoutCb: [Function: emitTimeout], parser: null, res: [Circular] }, _idleTimeout: -1, _idleNext: null, _idlePrev: null, _idleStart: 4136, read: [Function], _consuming: true, write: [Function: writeAfterFIN] }, connection: Socket { _connecting: false, _hadError: false, _handle: null, _parent: null, _host: 'docs.opencv.org', _readableState: ReadableState { objectMode: false, highWaterMark: 16384, buffer: [], length: 0, pipes: null, pipesCount: 0, flowing: true, ended: true, endEmitted: true, reading: false, sync: false, needReadable: false, emittedReadable: false, readableListening: false, defaultEncoding: 'utf8', ranOut: false, awaitDrain: 0, readingMore: false, decoder: null, encoding: null, resumeScheduled: false }, readable: false, domain: null, _events: { end: [Object], finish: [Function: onSocketFinish], _socketEnd: [Function: onSocketEnd], free: 
[Function: onFree], close: [Object], agentRemove: [Function: onRemove], drain: [Function: ondrain], error: [Object], data: [Function: socketOnData], timeout: [Object] }, _eventsCount: 10, _maxListeners: 0, _writableState: WritableState { objectMode: false, highWaterMark: 16384, needDrain: false, ending: true, ended: true, finished: true, decodeStrings: false, defaultEncoding: 'utf8', length: 0, writing: false, corked: 0, sync: false, bufferProcessing: false, onwrite: [Function], writecb: null, writelen: 0, bufferedRequest: null, lastBufferedRequest: null, pendingcb: 0, prefinished: true, errorEmitted: false }, writable: false, allowHalfOpen: false, destroyed: true, bytesRead: 383, _bytesDispatched: 292, _sockname: null, _pendingData: null, _pendingEncoding: '', parser: null, _httpMessage: ClientRequest { domain: null, _events: [Object], _eventsCount: 5, _maxListeners: undefined, output: [], outputEncodings: [], outputCallbacks: [], outputSize: 0, writable: true, _last: true, chunkedEncoding: false, shouldKeepAlive: false, useChunkedEncodingByDefault: false, sendDate: false, _removedHeader: {}, _contentLength: 0, _hasBody: true, _trailer: '', finished: true, _headerSent: true, socket: [Circular], connection: [Circular], _header: 'GET /3.0.0//3.0-last-rst HTTP/1.1\r\nHost: docs.opencv.org\r\nUser-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36\r\nReferer: http://docs.opencv.org/3.0.0/d9/df8/tutorial_root.html\r\nAccept-Encoding: gzip, deflate\r\nConnection: close\r\n\r\n', _headers: [Object], _headerNames: [Object], _onPendingData: null, agent: [Object], socketPath: undefined, method: 'GET', path: '/3.0.0//3.0-last-rst', timeoutCb: [Function: emitTimeout], parser: null, res: [Circular] }, _idleTimeout: -1, _idleNext: null, _idlePrev: null, _idleStart: 4136, read: [Function], _consuming: true, write: [Function: writeAfterFIN] }, httpVersionMajor: 1, httpVersionMinor: 1, httpVersion: '1.1', complete: true, 
headers: { server: 'nginx', date: 'Tue, 10 Nov 2015 09:58:11 GMT', 'content-type': 'text/html; charset=iso-8859-1', vary: 'Accept-Encoding', 'content-encoding': 'gzip', connection: 'close' }, rawHeaders: [ 'Server', 'nginx', 'Date', 'Tue, 10 Nov 2015 09:58:11 GMT', 'Content-Type', 'text/html; charset=iso-8859-1', 'Vary', 'Accept-Encoding', 'Content-Encoding', 'gzip', 'Connection', 'close' ], trailers: {}, rawTrailers: [], upgrade: false, url: '', method: null, statusCode: 404, statusMessage: 'Not Found', client: Socket { _connecting: false, _hadError: false, _handle: null, _parent: null, _host: 'docs.opencv.org', _readableState: ReadableState { objectMode: false, highWaterMark: 16384, buffer: [], length: 0, pipes: null, pipesCount: 0, flowing: true, ended: true, endEmitted: true, reading: false, sync: false, needReadable: false, emittedReadable: false, readableListening: false, defaultEncoding: 'utf8', ranOut: false, awaitDrain: 0, readingMore: false, decoder: null, encoding: null, resumeScheduled: false }, readable: false, domain: null, _events: { end: [Object], finish: [Function: onSocketFinish], _socketEnd: [Function: onSocketEnd], free: [Function: onFree], close: [Object], agentRemove: [Function: onRemove], drain: [Function: ondrain], error: [Object], data: [Function: socketOnData], timeout: [Object] }, _eventsCount: 10, _maxListeners: 0, _writableState: WritableState { objectMode: false, highWaterMark: 16384, needDrain: false, ending: true, ended: true, finished: true, decodeStrings: false, defaultEncoding: 'utf8', length: 0, writing: false, corked: 0, sync: false, bufferProcessing: false, onwrite: [Function], writecb: null, writelen: 0, bufferedRequest: null, lastBufferedRequest: null, pendingcb: 0, prefinished: true, errorEmitted: false }, writable: false, allowHalfOpen: false, destroyed: true, bytesRead: 383, _bytesDispatched: 292, _sockname: null, _pendingData: null, _pendingEncoding: '', parser: null, _httpMessage: ClientRequest { domain: null, _events: 
[Object], _eventsCount: 5, _maxListeners: undefined, output: [], outputEncodings: [], outputCallbacks: [], outputSize: 0, writable: true, _last: true, chunkedEncoding: false, shouldKeepAlive: false, useChunkedEncodingByDefault: false, sendDate: false, _removedHeader: {}, _contentLength: 0, _hasBody: true, _trailer: '', finished: true, _headerSent: true, socket: [Circular], connection: [Circular], _header: 'GET /3.0.0//3.0-last-rst HTTP/1.1\r\nHost: docs.opencv.org\r\nUser-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36\r\nReferer: http://docs.opencv.org/3.0.0/d9/df8/tutorial_root.html\r\nAccept-Encoding: gzip, deflate\r\nConnection: close\r\n\r\n', _headers: [Object], _headerNames: [Object], _onPendingData: null, agent: [Object], socketPath: undefined, method: 'GET', path: '/3.0.0//3.0-last-rst', timeoutCb: [Function: emitTimeout], parser: null, res: [Circular] }, _idleTimeout: -1, _idleNext: null, _idlePrev: null, _idleStart: 4136, read: [Function], _consuming: true, write: [Function: writeAfterFIN] }, _consuming: true, _dumped: false, req: ClientRequest { domain: null, _events: { socket: [Object], timeout: [Object], response: [Function: bound ], error: [Function: bound ], drain: [Function] }, _eventsCount: 5, _maxListeners: undefined, output: [], outputEncodings: [], outputCallbacks: [], outputSize: 0, writable: true, _last: true, chunkedEncoding: false, shouldKeepAlive: false, useChunkedEncodingByDefault: false, sendDate: false, _removedHeader: {}, _contentLength: 0, _hasBody: true, _trailer: '', finished: true, _headerSent: true, socket: Socket { _connecting: false, _hadError: false, _handle: null, _parent: null, _host: 'docs.opencv.org', _readableState: [Object], readable: false, domain: null, _events: [Object], _eventsCount: 10, _maxListeners: 0, _writableState: [Object], writable: false, allowHalfOpen: false, destroyed: true, bytesRead: 383, _bytesDispatched: 292, _sockname: null, _pendingData: 
null, _pendingEncoding: '', parser: null, _httpMessage: [Circular], _idleTimeout: -1, _idleNext: null, _idlePrev: null, _idleStart: 4136, read: [Function], _consuming: true, write: [Function: writeAfterFIN] }, connection: Socket { _connecting: false, _hadError: false, _handle: null, _parent: null, _host: 'docs.opencv.org', _readableState: [Object], readable: false, domain: null, _events: [Object], _eventsCount: 10, _maxListeners: 0, _writableState: [Object], writable: false, allowHalfOpen: false, destroyed: true, bytesRead: 383, _bytesDispatched: 292, _sockname: null, _pendingData: null, _pendingEncoding: '', parser: null, _httpMessage: [Circular], _idleTimeout: -1, _idleNext: null, _idlePrev: null, _idleStart: 4136, read: [Function], _consuming: true, write: [Function: writeAfterFIN] }, _header: 'GET /3.0.0//3.0-last-rst HTTP/1.1\r\nHost: docs.opencv.org\r\nUser-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36\r\nReferer: http://docs.opencv.org/3.0.0/d9/df8/tutorial_root.html\r\nAccept-Encoding: gzip, deflate\r\nConnection: close\r\n\r\n', _headers: { host: 'docs.opencv.org', 'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36', referer: 'http://docs.opencv.org/3.0.0/d9/df8/tutorial_root.html', 'accept-encoding': 'gzip, deflate' }, _headerNames: { host: 'Host', 'user-agent': 'User-Agent', referer: 'Referer', 'accept-encoding': 'Accept-Encoding' }, _onPendingData: null, agent: Agent { domain: null, _events: [Object], _eventsCount: 1, _maxListeners: undefined, defaultPort: 80, protocol: 'http:', options: [Object], requests: {}, sockets: [Object], freeSockets: {}, keepAliveMsecs: 1000, keepAlive: false, maxSockets: Infinity, maxFreeSockets: 256 }, socketPath: undefined, method: 'GET', path: '/3.0.0//3.0-last-rst', timeoutCb: [Function: emitTimeout], parser: null, res: [Circular] }, request: Request { domain: null, _events: { error: [Function: 
bound ], complete: [Function: bound ], pipe: [Function], end: [Object], data: [Function] }, _eventsCount: 5, _maxListeners: undefined, callback: [Function], followRedirect: true, timeout: 30000, headers: { Host: 'docs.opencv.org', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36', Referer: 'http://docs.opencv.org/3.0.0/d9/df8/tutorial_root.html', 'Accept-Encoding': 'gzip, deflate' }, encoding: null, method: 'GET', uri: Url { protocol: 'http:', slashes: true, auth: null, host: 'docs.opencv.org', port: 80, hostname: 'docs.opencv.org', hash: null, search: null, query: null, pathname: '/3.0.0//3.0-last-rst', path: '/3.0.0//3.0-last-rst', href: 'http://docs.opencv.org/3.0.0//3.0-last-rst' }, readable: true, writable: true, explicitMethod: true, _qs: Querystring { request: [Circular], lib: [Object], useQuerystring: undefined, parseOptions: {}, stringifyOptions: {} }, _auth: Auth { request: [Circular], hasAuth: false, sentAuth: false, bearerToken: null, user: null, pass: null }, _oauth: OAuth { request: [Circular], params: null }, _multipart: Multipart { request: [Circular], boundary: '03d2fef7-454d-4b17-85f7-74c8639b40e0', chunked: false, body: null }, _redirect: Redirect { request: [Circular], followRedirect: true, followRedirects: true, followAllRedirects: false, allowRedirect: [Function], maxRedirects: 10, redirects: [], redirectsFollowed: 0, removeRefererHeader: false }, _tunnel: Tunnel { request: [Circular], proxyHeaderWhiteList: [Object], proxyHeaderExclusiveList: [] }, setHeader: [Function], hasHeader: [Function], getHeader: [Function], removeHeader: [Function], localAddress: undefined, pool: {}, dests: [], __isRequestRequest: true, _callback: [Function: bound ], proxy: null, tunnel: undefined, setHost: false, originalCookieHeader: undefined, _jar: RequestJar { _jar: [Object] }, port: 80, host: 'docs.opencv.org', path: '/3.0.0//3.0-last-rst', httpModule: { IncomingMessage: [Object], METHODS: 
[Object], OutgoingMessage: [Object], ServerResponse: [Object], STATUS_CODES: [Object], Agent: [Object], globalAgent: [Object], ClientRequest: [Object], request: [Function], get: [Function], _connectionListener: [Function: connectionListener], Server: [Object], createServer: [Function], Client: [Function: deprecated], createClient: [Function: deprecated] }, agentClass: { [Function: Agent] super_: [Object], defaultMaxSockets: Infinity }, agent: Agent { domain: null, _events: [Object], _eventsCount: 1, _maxListeners: undefined, defaultPort: 80, protocol: 'http:', options: [Object], requests: {}, sockets: [Object], freeSockets: {}, keepAliveMsecs: 1000, keepAlive: false, maxSockets: Infinity, maxFreeSockets: 256 }, _started: true, href: 'http://docs.opencv.org/3.0.0//3.0-last-rst', req: ClientRequest { domain: null, _events: [Object], _eventsCount: 5, _maxListeners: undefined, output: [], outputEncodings: [], outputCallbacks: [], outputSize: 0, writable: true, _last: true, chunkedEncoding: false, shouldKeepAlive: false, useChunkedEncodingByDefault: false, sendDate: false, _removedHeader: {}, _contentLength: 0, _hasBody: true, _trailer: '', finished: true, _headerSent: true, socket: [Object], connection: [Object], _header: 'GET /3.0.0//3.0-last-rst HTTP/1.1\r\nHost: docs.opencv.org\r\nUser-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36\r\nReferer: http://docs.opencv.org/3.0.0/d9/df8/tutorial_root.html\r\nAccept-Encoding: gzip, deflate\r\nConnection: close\r\n\r\n', _headers: [Object], _headerNames: [Object], _onPendingData: null, agent: [Object], socketPath: undefined, method: 'GET', path: '/3.0.0//3.0-last-rst', timeoutCb: [Function: emitTimeout], parser: null, res: [Circular] }, timeoutTimer: null, ntick: true, response: [Circular], originalHost: 'docs.opencv.org', originalHostHeaderName: 'Host', responseContent: [Circular], _destdata: true, _ended: true, _callbackCalled: true }, toJSON: [Function: 
responseToJSON], caseless: Caseless { dict: { server: 'nginx', date: 'Tue, 10 Nov 2015 09:58:11 GMT', 'content-type': 'text/html; charset=iso-8859-1', vary: 'Accept-Encoding', 'content-encoding': 'gzip', connection: 'close' } }, read: [Function], body: <Buffer 1f 8b 08 00 00 00 00 00 00 03 4c 8e cd 0e 82 30 10 84 ef 3c c5 ca 1d 16 95 63 d3 83 fc 44 12 44 62 ea c1 23 a6 35 25 41 8a 6d d1 f8 f6 b6 70 f1 b4 99 ... >, cookies: {}今回はaタグのみで、imgタグやlinkタグ、scriptタグをたどっていないのでレイアウトは崩れて画像も表示できていなかった。 ちょっとタイムアウトが気持ち悪いけど、とりあえず再帰的に特定のドメイン以下のページを取得することはできた。
今日の作業
髪の毛切りに行ったりしてたら、今日はこれしかできなかった。。。
0 件のコメント:
コメントを投稿