2015年11月10日火曜日

特定サイトをクローリングする

このエントリーをはてなブックマークに追加
ある特定のドメインのサイトを一定階層までダウンロードしてみる。

/**
 * Main script: crawls one site starting from TARGET_URL and saves the pages
 * it reaches to local files, following links up to LINK_LEVEL levels deep.
 */

// Load libraries
// cheerio-httpcli: HTTP client whose fetch() callback receives (err, $, res)
var client = require('cheerio-httpcli');
// NOTE(review): `request` is not referenced anywhere in the code shown here
var request = require('request');
// Node's `url` module, bound to the name URL (used for URL.resolve)
var URL = require('url');
var fs = require('fs');
var path = require('path');

// Maximum link depth to follow (3 for this run); downloadRec stops when level >= LINK_LEVEL
var LINK_LEVEL = 3;
// Start URL of the crawl; downloadRec also derives its same-site prefix from it
var TARGET_URL = "http://docs.opencv.org/3.0.0/";
// URLs already handed to downloadRec: keys are URLs, values are true
var list = {};

// Kick off the recursive download from the start URL at depth 0
downloadRec(TARGET_URL, 0);

/*
 * 関数定義
 */
/**
 * Recursively fetches `url` and the pages it links to, and saves the HTML of
 * every successfully fetched page to a local file whose path mirrors the URL
 * (host/path...).
 *
 * Reads the module-level LINK_LEVEL, TARGET_URL and `list`, and records every
 * URL it is handed in `list` so no URL is processed twice.
 *
 * @param {string} url   Absolute URL of the page to fetch.
 * @param {number} level Link depth of `url`; the start page is 0.
 */
function downloadRec(url, level){
 // Stop once the maximum link depth has been reached
 if(level >= LINK_LEVEL){
  return;
 }

 // Skip URLs that have already been handed to this function
 if(list[url]){
  return;
 }

 list[url] = true;

 // Ignore other sites. TARGET_URL without its trailing "/" is the prefix every
 // crawled URL must START with; a plain "contains" test would also accept
 // external URLs that merely embed the prefix (e.g. in a query string).
 var us = TARGET_URL.split("/");
 us.pop();
 var base = us.join("/");
 if(url.indexOf(base) !== 0){
  return;
 }

 client.fetch(url, {}, function(err, $, res){
  // On failure (timeout, 404, ...) `$` is not usable; calling it raised
  // "TypeError: $ is not a function", so log the error and stop here.
  if(err){
   console.log("Error", err);
   console.log("RESPONCE", res);
   return;
  }

  // Follow every <a> element on the page
  $("a").each(function(idx){
   var href = $(this).attr("href");
   // Nothing to follow without an href
   if(!href){
    return;
   }

   // Turn relative links into absolute URLs
   href = URL.resolve(url, href);

   // Drop the fragment, including a bare trailing "#", so that
   // "page.html", "page.html#" and "page.html#sec" count as one page
   href = href.replace(/#.*$/, "");

   // Recurse into the linked page, one level deeper
   downloadRec(href, level + 1);
  });

  // A URL ending in "/" is stored as index.html inside that directory
  var target = url;
  if(target.charAt(target.length - 1) === "/"){
   target += "index.html";
  }

  // Drop the scheme ("http:" and the empty segment after it) so the file is
  // written to a relative path of the form host/dir/.../file
  var savepath = target.split("/").slice(2).join("/");
  checkSaveDir(savepath);
  console.log(savepath);
  fs.writeFileSync(savepath, $.html());
 });
}

// Ensure the directory that will contain `fname` exists, creating each
// missing level of the path in turn (the file itself is not created).
function checkSaveDir(fname){
 var segments = path.dirname(fname).split("/");
 var current = "";

 segments.forEach(function(segment){
  // Extend the path by one "/"-separated segment at a time
  current = current + segment + "/";
  if(fs.existsSync(current)){
   return;
  }
  fs.mkdirSync(current);
 });
}
順調に動いていると思ったら、次のエラーメッセージが出た。エラー処理を入れて確認してみる。
getall.js:37 $("a").each(function(idx){ ^ TypeError: $ is not a function     at Object.callback (/root/WebCrawler-NetAgent/ch02/getall.js:37:3)     at Object.module.exports.fail (/root/node_modules/cheerio-httpcli/lib/client.js:59:15)     at Object.<anonymous> (/root/node_modules/cheerio-httpcli/lib/client.js:191:21)     at Object.<anonymous> (/root/node_modules/cheerio-httpcli/lib/client.js:129:16)     at self.callback (/root/node_modules/cheerio-httpcli/node_modules/request/request.js:198:22)     at emitOne (events.js:77:13)     at Request.emit (events.js:169:7)     at null._onTimeout (/root/node_modules/cheerio-httpcli/node_modules/request/request.js:811:12)     at Timer.listOnTimeout (timers.js:92:15)
client.fetchの一番上にエラー処理を入れてみた。
:
:
 client.fetch(url, {}, function(err, $, res){
  if(err){
   console.log("Error", err);
   console.log("RESPONCE", res);
   return;
  }
  
  $("a").each(function(idx){
   var href = $(this).attr("href");
   if(!href){
    return;
   }
:
:
すると、約300ページほどタイムアウトで取得できていないことがわかった。ただURLを直接ブラウザで確認すると問題なさそうなので、なんでかなと思いつつ一旦スルー。ちなみにこんなエラー。
Error { [Error: ETIMEDOUT]   code: 'ETIMEDOUT',   connect: true,   url: 'http://docs.opencv.org/3.0.0/dd/de2/classcv_1_1AutoLock.html',   param: {} } RESPONCE undefined Error { [Error: ETIMEDOUT]   code: 'ETIMEDOUT',   connect: true,   url: 'http://docs.opencv.org/3.0.0/d7/d7b/classcv_1_1BackgroundSubtractorMOG2.html',   param: {} } RESPONCE undefined Error { [Error: ETIMEDOUT]   code: 'ETIMEDOUT',   connect: true,   url: 'http://docs.opencv.org/3.0.0/d0/d2e/classcv_1_1CommandLineParser.html',   param: {} }
同様のエラーが約300件(正確には332件)出ていた。取得できたページを確認してみるとこんな感じ。あと404も1件あった。これは実際のサイト上でリンクをたどっても404だったので、プログラム的には問題なし。
Error { [Error: server status]   url: 'http://docs.opencv.org/3.0.0//3.0-last-rst',   param: {},   statusCode: 404 } RESPONCE IncomingMessage {   _readableState:    ReadableState {      objectMode: false,      highWaterMark: 16384,      buffer: [],      length: 0,      pipes: null,      pipesCount: 0,      flowing: true,      ended: true,      endEmitted: true,      reading: false,      sync: false,      needReadable: false,      emittedReadable: false,      readableListening: false,      defaultEncoding: 'utf8',      ranOut: false,      awaitDrain: 0,      readingMore: false,      decoder: null,      encoding: null,      resumeScheduled: false },   readable: false,   domain: null,   _events:    { end: [ [Function: responseOnEnd], [Function], [Function], [Function] ],      close: [ [Function], [Function] ],      data: [Function],      error: [Function] },   _eventsCount: 4,   _maxListeners: undefined,   socket:    Socket {      _connecting: false,      _hadError: false,      _handle: null,      _parent: null,      _host: 'docs.opencv.org',      _readableState:       ReadableState {         objectMode: false,         highWaterMark: 16384,         buffer: [],         length: 0,         pipes: null,         pipesCount: 0,         flowing: true,         ended: true,         endEmitted: true,         reading: false,         sync: false,         needReadable: false,         emittedReadable: false,         readableListening: false,         defaultEncoding: 'utf8',         ranOut: false,         awaitDrain: 0,         readingMore: false,         decoder: null,         encoding: null,         resumeScheduled: false },      readable: false,      domain: null,      _events:       { end: [Object],         finish: [Function: onSocketFinish],         _socketEnd: [Function: onSocketEnd],         free: [Function: onFree],         close: [Object],         agentRemove: [Function: onRemove],         drain: [Function: ondrain],         error: [Object],         data: [Function: 
socketOnData],         timeout: [Object] },      _eventsCount: 10,      _maxListeners: 0,      _writableState:       WritableState {         objectMode: false,         highWaterMark: 16384,         needDrain: false,         ending: true,         ended: true,         finished: true,         decodeStrings: false,         defaultEncoding: 'utf8',         length: 0,         writing: false,         corked: 0,         sync: false,         bufferProcessing: false,         onwrite: [Function],         writecb: null,         writelen: 0,         bufferedRequest: null,         lastBufferedRequest: null,         pendingcb: 0,         prefinished: true,         errorEmitted: false },      writable: false,      allowHalfOpen: false,      destroyed: true,      bytesRead: 383,      _bytesDispatched: 292,      _sockname: null,      _pendingData: null,      _pendingEncoding: '',      parser: null,      _httpMessage:       ClientRequest {         domain: null,         _events: [Object],         _eventsCount: 5,         _maxListeners: undefined,         output: [],         outputEncodings: [],         outputCallbacks: [],         outputSize: 0,         writable: true,         _last: true,         chunkedEncoding: false,         shouldKeepAlive: false,         useChunkedEncodingByDefault: false,         sendDate: false,         _removedHeader: {},         _contentLength: 0,         _hasBody: true,         _trailer: '',         finished: true,         _headerSent: true,         socket: [Circular],         connection: [Circular],         _header: 'GET /3.0.0//3.0-last-rst HTTP/1.1\r\nHost: docs.opencv.org\r\nUser-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36\r\nReferer: http://docs.opencv.org/3.0.0/d9/df8/tutorial_root.html\r\nAccept-Encoding: gzip, deflate\r\nConnection: close\r\n\r\n',         _headers: [Object],         _headerNames: [Object],         _onPendingData: null,         agent: [Object],         socketPath: 
undefined,         method: 'GET',         path: '/3.0.0//3.0-last-rst',         timeoutCb: [Function: emitTimeout],         parser: null,         res: [Circular] },      _idleTimeout: -1,      _idleNext: null,      _idlePrev: null,      _idleStart: 4136,      read: [Function],      _consuming: true,      write: [Function: writeAfterFIN] },   connection:    Socket {      _connecting: false,      _hadError: false,      _handle: null,      _parent: null,      _host: 'docs.opencv.org',      _readableState:       ReadableState {         objectMode: false,         highWaterMark: 16384,         buffer: [],         length: 0,         pipes: null,         pipesCount: 0,         flowing: true,         ended: true,         endEmitted: true,         reading: false,         sync: false,         needReadable: false,         emittedReadable: false,         readableListening: false,         defaultEncoding: 'utf8',         ranOut: false,         awaitDrain: 0,         readingMore: false,         decoder: null,         encoding: null,         resumeScheduled: false },      readable: false,      domain: null,      _events:       { end: [Object],         finish: [Function: onSocketFinish],         _socketEnd: [Function: onSocketEnd],         free: [Function: onFree],         close: [Object],         agentRemove: [Function: onRemove],         drain: [Function: ondrain],         error: [Object],         data: [Function: socketOnData],         timeout: [Object] },      _eventsCount: 10,      _maxListeners: 0,      _writableState:       WritableState {         objectMode: false,         highWaterMark: 16384,         needDrain: false,         ending: true,         ended: true,         finished: true,         decodeStrings: false,         defaultEncoding: 'utf8',         length: 0,         writing: false,         corked: 0,         sync: false,         bufferProcessing: false,         onwrite: [Function],         writecb: null,         writelen: 0,         bufferedRequest: null,         
lastBufferedRequest: null,         pendingcb: 0,         prefinished: true,         errorEmitted: false },      writable: false,      allowHalfOpen: false,      destroyed: true,      bytesRead: 383,      _bytesDispatched: 292,      _sockname: null,      _pendingData: null,      _pendingEncoding: '',      parser: null,      _httpMessage:       ClientRequest {         domain: null,         _events: [Object],         _eventsCount: 5,         _maxListeners: undefined,         output: [],         outputEncodings: [],         outputCallbacks: [],         outputSize: 0,         writable: true,         _last: true,         chunkedEncoding: false,         shouldKeepAlive: false,         useChunkedEncodingByDefault: false,         sendDate: false,         _removedHeader: {},         _contentLength: 0,         _hasBody: true,         _trailer: '',         finished: true,         _headerSent: true,         socket: [Circular],         connection: [Circular],         _header: 'GET /3.0.0//3.0-last-rst HTTP/1.1\r\nHost: docs.opencv.org\r\nUser-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36\r\nReferer: http://docs.opencv.org/3.0.0/d9/df8/tutorial_root.html\r\nAccept-Encoding: gzip, deflate\r\nConnection: close\r\n\r\n',         _headers: [Object],         _headerNames: [Object],         _onPendingData: null,         agent: [Object],         socketPath: undefined,         method: 'GET',         path: '/3.0.0//3.0-last-rst',         timeoutCb: [Function: emitTimeout],         parser: null,         res: [Circular] },      _idleTimeout: -1,      _idleNext: null,      _idlePrev: null,      _idleStart: 4136,      read: [Function],      _consuming: true,      write: [Function: writeAfterFIN] },   httpVersionMajor: 1,   httpVersionMinor: 1,   httpVersion: '1.1',   complete: true,   headers:    { server: 'nginx',      date: 'Tue, 10 Nov 2015 09:58:11 GMT',      'content-type': 'text/html; charset=iso-8859-1',      vary: 
'Accept-Encoding',      'content-encoding': 'gzip',      connection: 'close' },   rawHeaders:    [ 'Server',      'nginx',      'Date',      'Tue, 10 Nov 2015 09:58:11 GMT',      'Content-Type',      'text/html; charset=iso-8859-1',      'Vary',      'Accept-Encoding',      'Content-Encoding',      'gzip',      'Connection',      'close' ],   trailers: {},   rawTrailers: [],   upgrade: false,   url: '',   method: null,   statusCode: 404,   statusMessage: 'Not Found',   client:    Socket {      _connecting: false,      _hadError: false,      _handle: null,      _parent: null,      _host: 'docs.opencv.org',      _readableState:       ReadableState {         objectMode: false,         highWaterMark: 16384,         buffer: [],         length: 0,         pipes: null,         pipesCount: 0,         flowing: true,         ended: true,         endEmitted: true,         reading: false,         sync: false,         needReadable: false,         emittedReadable: false,         readableListening: false,         defaultEncoding: 'utf8',         ranOut: false,         awaitDrain: 0,         readingMore: false,         decoder: null,         encoding: null,         resumeScheduled: false },      readable: false,      domain: null,      _events:       { end: [Object],         finish: [Function: onSocketFinish],         _socketEnd: [Function: onSocketEnd],         free: [Function: onFree],         close: [Object],         agentRemove: [Function: onRemove],         drain: [Function: ondrain],         error: [Object],         data: [Function: socketOnData],         timeout: [Object] },      _eventsCount: 10,      _maxListeners: 0,      _writableState:       WritableState {         objectMode: false,         highWaterMark: 16384,         needDrain: false,         ending: true,         ended: true,         finished: true,         decodeStrings: false,         defaultEncoding: 'utf8',         length: 0,         writing: false,         corked: 0,         sync: false,         
bufferProcessing: false,         onwrite: [Function],         writecb: null,         writelen: 0,         bufferedRequest: null,         lastBufferedRequest: null,         pendingcb: 0,         prefinished: true,         errorEmitted: false },      writable: false,      allowHalfOpen: false,      destroyed: true,      bytesRead: 383,      _bytesDispatched: 292,      _sockname: null,      _pendingData: null,      _pendingEncoding: '',      parser: null,      _httpMessage:       ClientRequest {         domain: null,         _events: [Object],         _eventsCount: 5,         _maxListeners: undefined,         output: [],         outputEncodings: [],         outputCallbacks: [],         outputSize: 0,         writable: true,         _last: true,         chunkedEncoding: false,         shouldKeepAlive: false,         useChunkedEncodingByDefault: false,         sendDate: false,         _removedHeader: {},         _contentLength: 0,         _hasBody: true,         _trailer: '',         finished: true,         _headerSent: true,         socket: [Circular],         connection: [Circular],         _header: 'GET /3.0.0//3.0-last-rst HTTP/1.1\r\nHost: docs.opencv.org\r\nUser-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36\r\nReferer: http://docs.opencv.org/3.0.0/d9/df8/tutorial_root.html\r\nAccept-Encoding: gzip, deflate\r\nConnection: close\r\n\r\n',         _headers: [Object],         _headerNames: [Object],         _onPendingData: null,         agent: [Object],         socketPath: undefined,         method: 'GET',         path: '/3.0.0//3.0-last-rst',         timeoutCb: [Function: emitTimeout],         parser: null,         res: [Circular] },      _idleTimeout: -1,      _idleNext: null,      _idlePrev: null,      _idleStart: 4136,      read: [Function],      _consuming: true,      write: [Function: writeAfterFIN] },   _consuming: true,   _dumped: false,   req:    ClientRequest {      domain: null,      _events:   
    { socket: [Object],         timeout: [Object],         response: [Function: bound ],         error: [Function: bound ],         drain: [Function] },      _eventsCount: 5,      _maxListeners: undefined,      output: [],      outputEncodings: [],      outputCallbacks: [],      outputSize: 0,      writable: true,      _last: true,      chunkedEncoding: false,      shouldKeepAlive: false,      useChunkedEncodingByDefault: false,      sendDate: false,      _removedHeader: {},      _contentLength: 0,      _hasBody: true,      _trailer: '',      finished: true,      _headerSent: true,      socket:       Socket {         _connecting: false,         _hadError: false,         _handle: null,         _parent: null,         _host: 'docs.opencv.org',         _readableState: [Object],         readable: false,         domain: null,         _events: [Object],         _eventsCount: 10,         _maxListeners: 0,         _writableState: [Object],         writable: false,         allowHalfOpen: false,         destroyed: true,         bytesRead: 383,         _bytesDispatched: 292,         _sockname: null,         _pendingData: null,         _pendingEncoding: '',         parser: null,         _httpMessage: [Circular],         _idleTimeout: -1,         _idleNext: null,         _idlePrev: null,         _idleStart: 4136,         read: [Function],         _consuming: true,         write: [Function: writeAfterFIN] },      connection:       Socket {         _connecting: false,         _hadError: false,         _handle: null,         _parent: null,         _host: 'docs.opencv.org',         _readableState: [Object],         readable: false,         domain: null,         _events: [Object],         _eventsCount: 10,         _maxListeners: 0,         _writableState: [Object],         writable: false,         allowHalfOpen: false,         destroyed: true,         bytesRead: 383,         _bytesDispatched: 292,         _sockname: null,         _pendingData: null,         _pendingEncoding: '',      
   parser: null,         _httpMessage: [Circular],         _idleTimeout: -1,         _idleNext: null,         _idlePrev: null,         _idleStart: 4136,         read: [Function],         _consuming: true,         write: [Function: writeAfterFIN] },      _header: 'GET /3.0.0//3.0-last-rst HTTP/1.1\r\nHost: docs.opencv.org\r\nUser-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36\r\nReferer: http://docs.opencv.org/3.0.0/d9/df8/tutorial_root.html\r\nAccept-Encoding: gzip, deflate\r\nConnection: close\r\n\r\n',      _headers:       { host: 'docs.opencv.org',         'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',         referer: 'http://docs.opencv.org/3.0.0/d9/df8/tutorial_root.html',         'accept-encoding': 'gzip, deflate' },      _headerNames:       { host: 'Host',         'user-agent': 'User-Agent',         referer: 'Referer',         'accept-encoding': 'Accept-Encoding' },      _onPendingData: null,      agent:       Agent {         domain: null,         _events: [Object],         _eventsCount: 1,         _maxListeners: undefined,         defaultPort: 80,         protocol: 'http:',         options: [Object],         requests: {},         sockets: [Object],         freeSockets: {},         keepAliveMsecs: 1000,         keepAlive: false,         maxSockets: Infinity,         maxFreeSockets: 256 },      socketPath: undefined,      method: 'GET',      path: '/3.0.0//3.0-last-rst',      timeoutCb: [Function: emitTimeout],      parser: null,      res: [Circular] },   request:    Request {      domain: null,      _events:       { error: [Function: bound ],         complete: [Function: bound ],         pipe: [Function],         end: [Object],         data: [Function] },      _eventsCount: 5,      _maxListeners: undefined,      callback: [Function],      followRedirect: true,      timeout: 30000,      headers:       { Host: 
'docs.opencv.org',         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',         Referer: 'http://docs.opencv.org/3.0.0/d9/df8/tutorial_root.html',         'Accept-Encoding': 'gzip, deflate' },      encoding: null,      method: 'GET',      uri:       Url {         protocol: 'http:',         slashes: true,         auth: null,         host: 'docs.opencv.org',         port: 80,         hostname: 'docs.opencv.org',         hash: null,         search: null,         query: null,         pathname: '/3.0.0//3.0-last-rst',         path: '/3.0.0//3.0-last-rst',         href: 'http://docs.opencv.org/3.0.0//3.0-last-rst' },      readable: true,      writable: true,      explicitMethod: true,      _qs:       Querystring {         request: [Circular],         lib: [Object],         useQuerystring: undefined,         parseOptions: {},         stringifyOptions: {} },      _auth:       Auth {         request: [Circular],         hasAuth: false,         sentAuth: false,         bearerToken: null,         user: null,         pass: null },      _oauth: OAuth { request: [Circular], params: null },      _multipart:       Multipart {         request: [Circular],         boundary: '03d2fef7-454d-4b17-85f7-74c8639b40e0',         chunked: false,         body: null },      _redirect:       Redirect {         request: [Circular],         followRedirect: true,         followRedirects: true,         followAllRedirects: false,         allowRedirect: [Function],         maxRedirects: 10,         redirects: [],         redirectsFollowed: 0,         removeRefererHeader: false },      _tunnel:       Tunnel {         request: [Circular],         proxyHeaderWhiteList: [Object],         proxyHeaderExclusiveList: [] },      setHeader: [Function],      hasHeader: [Function],      getHeader: [Function],      removeHeader: [Function],      localAddress: undefined,      pool: {},      dests: [],      __isRequestRequest: true,      
_callback: [Function: bound ],      proxy: null,      tunnel: undefined,      setHost: false,      originalCookieHeader: undefined,      _jar: RequestJar { _jar: [Object] },      port: 80,      host: 'docs.opencv.org',      path: '/3.0.0//3.0-last-rst',      httpModule:       { IncomingMessage: [Object],         METHODS: [Object],         OutgoingMessage: [Object],         ServerResponse: [Object],         STATUS_CODES: [Object],         Agent: [Object],         globalAgent: [Object],         ClientRequest: [Object],         request: [Function],         get: [Function],         _connectionListener: [Function: connectionListener],         Server: [Object],         createServer: [Function],         Client: [Function: deprecated],         createClient: [Function: deprecated] },      agentClass: { [Function: Agent] super_: [Object], defaultMaxSockets: Infinity },      agent:       Agent {         domain: null,         _events: [Object],         _eventsCount: 1,         _maxListeners: undefined,         defaultPort: 80,         protocol: 'http:',         options: [Object],         requests: {},         sockets: [Object],         freeSockets: {},         keepAliveMsecs: 1000,         keepAlive: false,         maxSockets: Infinity,         maxFreeSockets: 256 },      _started: true,      href: 'http://docs.opencv.org/3.0.0//3.0-last-rst',      req:       ClientRequest {         domain: null,         _events: [Object],         _eventsCount: 5,         _maxListeners: undefined,         output: [],         outputEncodings: [],         outputCallbacks: [],         outputSize: 0,         writable: true,         _last: true,         chunkedEncoding: false,         shouldKeepAlive: false,         useChunkedEncodingByDefault: false,         sendDate: false,         _removedHeader: {},         _contentLength: 0,         _hasBody: true,         _trailer: '',         finished: true,         _headerSent: true,         socket: [Object],         connection: [Object],         _header: 
'GET /3.0.0//3.0-last-rst HTTP/1.1\r\nHost: docs.opencv.org\r\nUser-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36\r\nReferer: http://docs.opencv.org/3.0.0/d9/df8/tutorial_root.html\r\nAccept-Encoding: gzip, deflate\r\nConnection: close\r\n\r\n',         _headers: [Object],         _headerNames: [Object],         _onPendingData: null,         agent: [Object],         socketPath: undefined,         method: 'GET',         path: '/3.0.0//3.0-last-rst',         timeoutCb: [Function: emitTimeout],         parser: null,         res: [Circular] },      timeoutTimer: null,      ntick: true,      response: [Circular],      originalHost: 'docs.opencv.org',      originalHostHeaderName: 'Host',      responseContent: [Circular],      _destdata: true,      _ended: true,      _callbackCalled: true },   toJSON: [Function: responseToJSON],   caseless:    Caseless {      dict:       { server: 'nginx',         date: 'Tue, 10 Nov 2015 09:58:11 GMT',         'content-type': 'text/html; charset=iso-8859-1',         vary: 'Accept-Encoding',         'content-encoding': 'gzip',         connection: 'close' } },   read: [Function],   body: <Buffer 1f 8b 08 00 00 00 00 00 00 03 4c 8e cd 0e 82 30 10 84 ef 3c c5 ca 1d 16 95 63 d3 83 fc 44 12 44 62 ea c1 23 a6 35 25 41 8a 6d d1 f8 f6 b6 70 f1 b4 99 ... >,   cookies: {}
今回はaタグのみで、imgタグやlinkタグ、scriptタグをたどっていないのでレイアウトは崩れて画像も表示できていなかった。
ちょっとタイムアウトが気持ち悪いけど、とりあえず再帰的に特定のドメイン以下のページを取得することはできた。
今日の作業

髪の毛切りに行ったりしてたら、今日はこれしかできなかった。。。

0 件のコメント:

コメントを投稿