

[h1]Websocket数据抓取[/h1]
通过 Chrome 开发者工具的 Network 面板筛选 WS,找到 WebSocket 连接,进行第一步分析。
待续…………
[h1]常规爬取[/h1]
这个都懂,咱就不说了,有些反爬验证实在是反人类哈。
[h1]通过油猴添加自定义JS操作网页[/h1]
这个好用是确实好用:真实的浏览器环境、真实的操作……就是效率实在不敢恭维。
因为一切都在真实的环境和网站上进行,很多常规的爬虫操作都不用考虑,只要考虑以下几点:
- 如何减少内存消耗,提升效率;
- 如何通过js ajax把获取到的数据发送到服务端或者输出到指定点;
[h2]1.能干嘛?[/h2]
- 实时获取网页数据。(监测支付宝订单…)
- 通过iframe抓取渲染后的网页数据。
- 操作网页事件,模拟用户行为。
- websocket数据监测。
- ajax获取静态页面数据。
[h2]2.已经遇到并解决的问题[/h2]
- 获取iframe元素DOM的正确方法:https://www.jianshu.com/p/c622ee151d5c。
- jQuery 释放内存的问题:remove()、empty() 是不会释放占用的内存的。
/**
 * jQuery memory-cleanup plugin.
 *
 * For every matched element, walks the element and all of its descendants,
 * asks jQuery to drop the event handlers and data it stored for each node,
 * and then detaches the matched element from its parent.
 *
 * NOTE(review): `$.event.remove` is jQuery-internal API; confirm it is still
 * available in the jQuery version the target page ships.
 *
 * @returns {jQuery} The original matched set, so the call can be chained.
 */
$.fn.removeWithLeakage = function () {
  return this.each(function (i, e) {
    // Clean descendants first, then the element itself.
    $("*", e).add([e]).each(function () {
      $.event.remove(this);
      $.removeData(this);
    });
    if (e.parentNode) {
      e.parentNode.removeChild(e);
    }
  });
};
- setTimeout、setInterval定时器要手动释放,不然会一直保持。
- JS错误处理只支持一个catch块,捕获通用错误类型后再通过判断错误类型进行处理。
// A `try` statement accepts only one `catch` clause, so catch the generic
// error and branch on its concrete type inside the handler.
try {
  someFunction();
} catch (error) {
  if (error instanceof TypeError) {
    // Handle type errors.
  } else if (error instanceof ReferenceError) {
    // Handle reference errors.
  } else {
    // Handle every other kind of error.
  }
}
[h1]三、重载XHR监听全局的Ajax请求[/h1]
/**
 * Replaces `window.XMLHttpRequest` with a wrapper that re-broadcasts every
 * lifecycle event of every XHR as a `CustomEvent` on `window`.
 *
 * Each custom event carries the originating XHR in `event.detail`, e.g.
 *   window.addEventListener('ajaxLoad', (e) => inspect(e.detail.responseText));
 *
 * Emitted event names: ajaxAbort, ajaxError, ajaxLoad, ajaxLoadStart,
 * ajaxProgress, ajaxTimeout, ajaxLoadEnd, ajaxReadyStateChange.
 *
 * The wrapper is installed at most once; running this snippet again is a
 * no-op, so each event is dispatched a single time per XHR.
 */
;(function () {
  const PATCH_FLAG = '__ajaxEventsPatched';
  const oldXHR = window.XMLHttpRequest;

  // Nothing to wrap, or already wrapped by an earlier injection.
  if (typeof oldXHR !== 'function' || oldXHR[PATCH_FLAG]) {
    return;
  }

  // Native XHR event name -> custom event name dispatched on window.
  const EVENT_NAMES = {
    abort: 'ajaxAbort',
    error: 'ajaxError',
    load: 'ajaxLoad',
    loadstart: 'ajaxLoadStart',
    progress: 'ajaxProgress',
    timeout: 'ajaxTimeout',
    loadend: 'ajaxLoadEnd',
    readystatechange: 'ajaxReadyStateChange',
  };

  // Dispatches `eventName` on window with the XHR as `detail`.
  function ajaxEventTrigger(xhr, eventName) {
    window.dispatchEvent(new CustomEvent(eventName, {detail: xhr}));
  }

  // Constructor replacement: returns a real XHR with the relay listeners attached.
  function newXHR() {
    const realXHR = new oldXHR();
    Object.keys(EVENT_NAMES).forEach((nativeName) => {
      realXHR.addEventListener(nativeName, () => {
        // No console logging here: logged XHR objects stay referenced by the
        // console and keep growing memory in long-running sessions.
        ajaxEventTrigger(realXHR, EVENT_NAMES[nativeName]);
      }, false);
    });
    return realXHR;
  }

  // Keep `xhr instanceof XMLHttpRequest` and prototype patches by other
  // scripts working against the replacement constructor.
  newXHR.prototype = oldXHR.prototype;
  // Page code commonly compares readyState against these static constants.
  ['UNSENT', 'OPENED', 'HEADERS_RECEIVED', 'LOADING', 'DONE'].forEach((name) => {
    newXHR[name] = oldXHR[name];
  });
  newXHR[PATCH_FLAG] = true;

  window.XMLHttpRequest = newXHR;
})();
[h2]fetch请求拦截[/h2]
/**
 * Intercepts `window.fetch`: logs the call arguments, forwards the call to the
 * original implementation, logs the JSON-decoded body of a cloned response,
 * and returns the original response untouched to the caller.
 */
const {fetch: origFetch} = window; // It really is this simple.
window.fetch = async (...args) => {
  console.log("fetch called with args:", args);
  const response = await origFetch(...args);

  // Work with a clone in a separate promise chain so the body of the response
  // handed back to the caller is still unread. Bodies that are not valid JSON
  // end up in the `.catch` and are reported via console.error.
  response
    .clone()
    .json()
    .then(body => console.log("intercepted response:", body))
    .catch(err => console.error(err));

  // The original response is resolved unmodified.
  return response;
};
[h1]四、文件下载[/h1]
/**
 * Reads the current value of the `#text` element and offers it to the user as
 * a download named `cdk.txt`.
 *
 * The payload does not have to come from the page; any string (for example a
 * server response) can be passed to `exportRaw` directly.
 */
function saveFile() {
  const data = document.querySelector('#text').value;
  const name = 'cdk.txt';
  exportRaw(data, name);
}

/**
 * Triggers a browser download of `data` under the file name `name`.
 *
 * @param {*} data - Content placed into the Blob (passed as its only part).
 * @param {string} name - Suggested file name for the download.
 */
function exportRaw(data, name) {
  const urlObject = window.URL || window.webkitURL || window;
  const exportBlob = new Blob([data]);
  const saveLink = document.createElementNS("http://www.w3.org/1999/xhtml", "a");
  const objectUrl = urlObject.createObjectURL(exportBlob);

  saveLink.href = objectUrl;
  saveLink.download = name;
  saveLink.click();

  // An object URL keeps its Blob alive until it is revoked, so release it.
  // The revoke is deferred to give the browser time to start the download.
  setTimeout(() => {
    if (typeof urlObject.revokeObjectURL === 'function') {
      urlObject.revokeObjectURL(objectUrl);
    }
  }, 1000);
}
[h1]爬虫经验总结[/h1]
- 人工过验证,然后保存cookie,模拟请求头。
- 跟浏览器同样的请求头,发送请求失败时,可以抓一下自己程序的包,这样很容易看出问题在哪。
[h1]油猴爬虫脚本编写记录[/h1]
- 避免无限互相回调,会导致内存无限增加。
- 避免大量的console.log,log对象时会一直存在内存,导致内存无限增加。
- 可以使用 window.open 打开新的窗口,调用 close 关闭后会自动释放内存。
- 使用async、await 、Promise强行配合网页的加载过程,同步爬行。
- 使用打包工具编写,通过PHP输出module模块,插入浏览器加载即可。
[h1]Vue、React模拟输入[/h1]
Vue:
// Simulate a user typing "123" into the first <input> on the page so that the
// framework's own `input` listeners pick up the new value.
let input = $("input").get(0);
let lastValue = input.value;
input.value = '123';
let event = new Event('input', { bubbles: true });
// NOTE(review): `_valueTracker` looks like the value tracker React attaches to
// inputs; resetting it to the previous value presumably makes the framework
// treat the dispatched event as a real change. Elements without a tracker
// skip this step.
let tracker = input._valueTracker;
if (tracker) {
  tracker.setValue(lastValue);
}
input.dispatchEvent(event);
React15以下:
// Simulate a user typing "123" into the first <input> on the page: set the
// value, then dispatch a bubbling, cancelable `input` event on the element.
let casess = $("input").get(0);
casess.value = '123';
// Built with the Event constructor (same flags as the former
// `initEvent('input', true, true)` call), matching the other snippets here.
var event = new Event('input', { bubbles: true, cancelable: true });
// NOTE(review): the purpose of the custom `eventType` marker is not visible
// here; it is kept as-is.
event.eventType = 'message';
casess.dispatchEvent(event);
React15以上:
// Simulate a user typing `text` (supplied by the surrounding script) into the
// product-ID input, identified by its placeholder.
let dom = $('[placeholder="请输入完整商品ID"]');
let old = dom[0].value;
dom[0].value = text;
let event = new Event("input", {bubbles: true});
// React 15: NOTE(review) the `simulated` flag is presumably required for
// React 15 to accept a scripted event — confirm against the target page.
event.simulated = true;
// NOTE(review): `_valueTracker` looks like the value tracker newer React
// versions attach to inputs; resetting it to the previous value presumably
// makes React register the dispatched event as a change.
let tracker = dom[0]._valueTracker;
if (tracker) {
  tracker.setValue(old);
}
dom[0].dispatchEvent(event);
© 版权声明
THE END
暂无评论内容