最近一直在做数据采集的事情,目的是使用java开发一套引擎:解析指定的采集规则,模拟用户动作做数据提取。 因此定义了一套动作脚本:open、click、get、list、opentab、closetab……由java解析脚本,调用phantomjs做数据提取,生成数据json文件,对外提供数据接口。 采集引擎终于写得差不多了,虽然还有很多问题需要修改,但是终于不用加班了,嘿嘿嘿。-------jstarseven
码字挺累的,转载请注明出处:http://www.cnblogs.com/jstarseven/p/6278197.html 言归正传,由于一直搞这些东西,突然想着拿js去写个采集玩一玩,就用tampermonkey,毕竟好久没玩了。
简介:针对一些网站的数据列表,定义采集脚本,模拟用户操作,做列表数据提取,生成json数据格式化展示。
json采集脚本定义:
1 {
2 "type": "list",
3 "selector": "",//列表选择器
4 "max_page": 1,//采集页数
5 "page_selector": "",//翻页选择器
6 "iframe_selector": "",//iframe 选择器
7 "datas": [//采集字段定义
8 {
9 "selector": " ",//字段选择器<此处为针对列表的子选择器>
10 "column": "title",//字段名称
11 "from": "text",//采集类型
12 "iframe_selector": "",//iframe选择器 防止一些网站怪异 一般不需要
13 "open_tab": [//当前字段开新标签做采集
14 {
15 "selector": " ",//新标签字段选择器
16 "column": " ",
17 "from": "text",
18 "iframe_selector": ""
19 },
20 {
21 "selector": " ",
22 "column": " ",
23 "from": "text",
24 "iframe_selector": ""
25 },
26 {
27 "selector": " ",
28 "column": " ",
29 "from": "text",
30 "iframe_selector": ""
31 }
32 ]
33 },
34 {
35 "selector": " ",//字段选择器
36 "column": " ",
37 "from": "text",
38 "iframe_selector": ""
39 },
40 {
41 "selector": " ",//字段选择器
42 "column": " ",
43 "from": "text",
44 "iframe_selector": ""
45 }
46 ]
47 }
脚本定义好了,剩下的就是写js代码解析脚本,做数据采集、数据合并了。 那么怎么去解析实现呢?针对新开标签页的数据采集,怎样和之前的列表项数据做合并,保证数据的完整性呢? 1.因为数据需要做存储,首先要考虑这么多数据该怎么存储。最先想到的是sessionStorage,但是sessionStorage在新开标签页的时候数据不能共享, 那么就用localStorage;localStorage一般上限5MB左右,足以存储一般列表的十几页数据。 2.详情页面的数据和列表项数据合并:既然上面说到localStorage,那么就在localStorage里面放入一个指定的map,存放列表数据; 针对列表的每一项生成一个key,然后在新开标签的时候传递key,提取详情的数据,将详情页面数据放入map中指定key的数据中。 js实现map方便数据存储:
/*
 * Map: a small insertion-ordered key/value store backed by a plain array.
 *
 * NOTE: declaring this constructor shadows the built-in ES2015 `Map` in the
 * scope where it is defined.
 *
 * Interface:
 * size()                number of entries
 * isEmpty()             true when there are no entries
 * clear()               remove every entry
 * put(key, value)       add an entry, or overwrite the value of an existing key
 * remove(key)           delete the entry for key; true on success, false otherwise
 * get(key)              value stored for key, or null when the key is absent
 * element(index)        the {key, value} entry at index, or null when there is none
 * containsKey(key)      whether an entry with this key exists
 * containsValue(value)  whether an entry with this value exists
 * values()              array of all values, in insertion order
 * keys()                array of all keys, in insertion order
 *
 * Keys and values are compared with loose equality (==), so "1" and 1 are
 * treated as the same key.
 */
function Map() {
    // Deliberately a public, enumerable property: it is the only state of the
    // map, so JSON.stringify(map) yields {"elements": [...]} (function-valued
    // properties are skipped) and a map can be restored by assigning it back.
    this.elements = [];

    // Methods are assigned here rather than on the prototype so the
    // constructor keeps working when it is called, thanks to function
    // hoisting, from code that runs before this definition is reached.

    // Index of the first entry whose key loosely equals `key`, or -1.
    function indexOfKey(entries, key) {
        for (var i = 0; i < entries.length; i++) {
            if (entries[i].key == key) {
                return i;
            }
        }
        return -1;
    }

    // Number of entries.
    this.size = function () {
        return this.elements.length;
    };

    // True when the map holds no entries.
    this.isEmpty = function () {
        return (this.elements.length < 1);
    };

    // Remove every entry.
    this.clear = function () {
        this.elements = [];
    };

    // Add (key, value); an existing key keeps its position and gets the new value.
    this.put = function (_key, _value) {
        var index = indexOfKey(this.elements, _key);
        if (index >= 0) {
            this.elements[index].value = _value;
            return;
        }
        this.elements.push({
            key: _key,
            value: _value
        });
    };

    // Delete the entry for the key. Returns true on success, false when the key
    // is absent or the backing array is unusable (e.g. restored from bad data).
    this.remove = function (_key) {
        try {
            var index = indexOfKey(this.elements, _key);
            if (index < 0) {
                return false;
            }
            this.elements.splice(index, 1);
            return true;
        } catch (e) {
            return false;
        }
    };

    // Value stored for the key, or null when the key is absent (or the backing
    // array is unusable). Previously a missing key produced `undefined`.
    this.get = function (_key) {
        try {
            var index = indexOfKey(this.elements, _key);
            return index >= 0 ? this.elements[index].value : null;
        } catch (e) {
            return null;
        }
    };

    // The {key, value} entry at the given index, or null when there is none
    // (out of range, or not a valid array index at all).
    this.element = function (_index) {
        if (_index < 0 || _index >= this.elements.length) {
            return null;
        }
        var entry = this.elements[_index];
        return entry === undefined ? null : entry;
    };

    // Whether an entry with this key exists.
    this.containsKey = function (_key) {
        try {
            return indexOfKey(this.elements, _key) >= 0;
        } catch (e) {
            return false;
        }
    };

    // Whether an entry with this value exists (loose equality).
    this.containsValue = function (_value) {
        try {
            for (var i = 0; i < this.elements.length; i++) {
                if (this.elements[i].value == _value) {
                    return true;
                }
            }
        } catch (e) {
            return false;
        }
        return false;
    };

    // Array of all values, in insertion order.
    this.values = function () {
        var arr = [];
        for (var i = 0; i < this.elements.length; i++) {
            arr.push(this.elements[i].value);
        }
        return arr;
    };

    // Array of all keys, in insertion order.
    this.keys = function () {
        var arr = [];
        for (var i = 0; i < this.elements.length; i++) {
            arr.push(this.elements[i].key);
        }
        return arr;
    };
}
js实现操作localStorage:
/**
 * Load the task data map persisted in localStorage under the "data_maps" key.
 *
 * When nothing usable is stored (isNullParam() reports the raw value as empty)
 * a fresh, empty Map is returned. Otherwise the stored JSON is parsed and its
 * `elements` array is adopted as the backing array of the returned Map.
 *
 * @returns {Map} always a Map instance whose `elements` is an array
 * @throws {SyntaxError} when the stored string is not valid JSON; corrupted
 *         storage is surfaced rather than silently discarded
 */
function getTaskDataMap() {
    var datas = new Map();
    var stored = localStorage.getItem("data_maps");
    if (isNullParam(stored)) {
        return datas;
    }
    var parsed = JSON.parse(stored);
    // Adopt the stored entries only when they have the expected shape. Payloads
    // such as "null" or "{}" would otherwise leave the map with a non-array
    // `elements` and break every later size()/put()/get() call.
    if (parsed && Array.isArray(parsed.elements)) {
        datas.elements = parsed.elements;
    }
    return datas;
}
15
/**
 * Reset the persisted task data: the "data_maps" entry in localStorage is
 * overwritten with an empty string (the entry itself is kept, not removed).
 */
function clearTaskDataMap() {
    var storageKey = "data_maps";
    var emptyPayload = "";
    localStorage.setItem(storageKey, emptyPayload);
}
22
/**
 * Store one entry in the persisted task data map.
 *
 * The current map is loaded with getTaskDataMap(), the entry is put into it
 * (put() is expected to overwrite an existing key), and the whole map is
 * written back to localStorage under "data_maps" as JSON.
 * Nothing happens when isNullParam() reports either argument as empty.
 *
 * @param key    key identifying the entry
 * @param values data to store under that key
 */
function addTaskDataMap(key, values) {
    var hasKey = !isNullParam(key);
    if (!hasKey || isNullParam(values)) {
        return;
    }
    var taskData = getTaskDataMap();
    taskData.put(key, values);
    var serialized = JSON.stringify(taskData);
    localStorage.setItem("data_maps", serialized);
}
采用jquery.simulate.js实现点击
1 /*!
2 * jQuery Simulate v@VERSION - simulate browser mouse and keyboard events
3 * https://github.com/jquery/jquery-simulate
4 *
5 * Copyright jQuery Foundation and other contributors
6 * Released under the MIT license.
7 * http://jquery.org/license
8 *
9 * Date: @DATE
10 */
11
;(function ($, undefined) {

    // Event-type classifiers: "key*" types are keyboard events; types starting
    // with "mouse"/"contextmenu", or containing "click", are mouse events.
    var rkeyEvent = /^key/,
        rmouseEvent = /^(?:mouse|contextmenu)|click/;

    /**
     * Plugin entry point: simulate the event `type` on every matched element.
     *
     * @param type    event name (e.g. "click", "keydown") or a complex action
     *                implemented below ("focus", "blur", "drag")
     * @param options event properties / action options; may be omitted
     * @returns the jQuery set, for chaining
     */
    $.fn.simulate = function (type, options) {
        return this.each(function () {
            new $.simulate(this, type, options);
        });
    };

    /**
     * Constructor that performs one simulation on one element. When the
     * prototype has a `simulate<Type>` method (simulateFocus, simulateBlur,
     * simulateDrag) it is invoked; otherwise a single synthetic event of the
     * given type is created and dispatched.
     */
    $.simulate = function (elem, type, options) {
        var method = $.camelCase("simulate-" + type);

        this.target = elem;
        this.options = options;

        if (this[method]) {
            this[method]();
        } else {
            this.simulateEvent(elem, type, options);
        }
    };

    // Static lookup tables callers can use when building options.
    $.extend($.simulate, {

        keyCode: {
            BACKSPACE: 8,
            COMMA: 188,
            DELETE: 46,
            DOWN: 40,
            END: 35,
            ENTER: 13,
            ESCAPE: 27,
            HOME: 36,
            LEFT: 37,
            NUMPAD_ADD: 107,
            NUMPAD_DECIMAL: 110,
            NUMPAD_DIVIDE: 111,
            NUMPAD_ENTER: 108,
            NUMPAD_MULTIPLY: 106,
            NUMPAD_SUBTRACT: 109,
            PAGE_DOWN: 34,
            PAGE_UP: 33,
            PERIOD: 190,
            RIGHT: 39,
            SPACE: 32,
            TAB: 9,
            UP: 38
        },

        buttonCode: {
            LEFT: 0,
            MIDDLE: 1,
            RIGHT: 2
        }
    });

    $.extend($.simulate.prototype, {

        // Create a synthetic event of `type` and dispatch it on `elem`.
        simulateEvent: function (elem, type, options) {
            var event = this.createEvent(type, options);
            this.dispatchEvent(elem, type, event, options);
        },

        // Build the event object matching the event family of `type`.
        createEvent: function (type, options) {
            if (rkeyEvent.test(type)) {
                return this.keyEvent(type, options);
            }

            if (rmouseEvent.test(type)) {
                return this.mouseEvent(type, options);
            }

            // Any other type (e.g. "change", "submit") used to yield undefined,
            // which made dispatchEvent() throw; fall back to a plain event.
            return this.genericEvent(type, options);
        },

        // Build a plain (non-mouse, non-keyboard) event. Only `bubbles` and
        // `cancelable` are honoured on the standards path.
        genericEvent: function (type, options) {
            var event;
            options = $.extend({
                bubbles: true,
                cancelable: true
            }, options);

            if (document.createEvent) {
                event = document.createEvent("Events");
                event.initEvent(type, options.bubbles, options.cancelable);
            } else if (document.createEventObject) {
                event = document.createEventObject();
                $.extend(event, options);
            }

            return event;
        },

        // Build a mouse event; `options` override the defaults listed below.
        mouseEvent: function (type, options) {
            var event, eventDoc, doc, body;
            options = $.extend({
                bubbles: true,
                cancelable: (type !== "mousemove"),
                view: window,
                detail: 0,
                screenX: 0,
                screenY: 0,
                clientX: 1,
                clientY: 1,
                ctrlKey: false,
                altKey: false,
                shiftKey: false,
                metaKey: false,
                button: 0,
                relatedTarget: undefined
            }, options);

            if (document.createEvent) {
                event = document.createEvent("MouseEvents");
                event.initMouseEvent(type, options.bubbles, options.cancelable,
                    options.view, options.detail,
                    options.screenX, options.screenY, options.clientX, options.clientY,
                    options.ctrlKey, options.altKey, options.shiftKey, options.metaKey,
                    options.button, options.relatedTarget || document.body.parentNode);

                // IE 9+ creates events with pageX and pageY set to 0.
                // Trying to modify the properties throws an error,
                // so we define getters to return the correct values.
                if (event.pageX === 0 && event.pageY === 0 && Object.defineProperty) {
                    eventDoc = event.relatedTarget.ownerDocument || document;
                    doc = eventDoc.documentElement;
                    body = eventDoc.body;

                    Object.defineProperty(event, "pageX", {
                        get: function () {
                            return options.clientX +
                                ( doc && doc.scrollLeft || body && body.scrollLeft || 0 ) -
                                ( doc && doc.clientLeft || body && body.clientLeft || 0 );
                        }
                    });
                    Object.defineProperty(event, "pageY", {
                        get: function () {
                            return options.clientY +
                                ( doc && doc.scrollTop || body && body.scrollTop || 0 ) -
                                ( doc && doc.clientTop || body && body.clientTop || 0 );
                        }
                    });
                }
            } else if (document.createEventObject) {
                event = document.createEventObject();
                $.extend(event, options);
                // standards event.button uses constants defined here: http://msdn.microsoft.com/en-us/library/ie/ff974877(v=vs.85).aspx
                // old IE event.button uses constants defined here: http://msdn.microsoft.com/en-us/library/ie/ms533544(v=vs.85).aspx
                // so we actually need to map the standard back to oldIE
                event.button = {
                        0: 1,
                        1: 4,
                        2: 2
                    }[event.button] || ( event.button === -1 ? 0 : event.button );
            }

            return event;
        },

        // Build a keyboard event; `options` override the defaults listed below.
        keyEvent: function (type, options) {
            var event;
            options = $.extend({
                bubbles: true,
                cancelable: true,
                view: window,
                ctrlKey: false,
                altKey: false,
                shiftKey: false,
                metaKey: false,
                keyCode: 0,
                charCode: undefined
            }, options);

            if (document.createEvent) {
                try {
                    event = document.createEvent("KeyEvents");
                    event.initKeyEvent(type, options.bubbles, options.cancelable, options.view,
                        options.ctrlKey, options.altKey, options.shiftKey, options.metaKey,
                        options.keyCode, options.charCode);
                    // initKeyEvent throws an exception in WebKit
                    // see: http://stackoverflow.com/questions/6406784/initkeyevent-keypress-only-works-in-firefox-need-a-cross-browser-solution
                    // and also https://bugs.webkit.org/show_bug.cgi?id=13368
                    // fall back to a generic event until we decide to implement initKeyboardEvent
                } catch (err) {
                    event = document.createEvent("Events");
                    event.initEvent(type, options.bubbles, options.cancelable);
                    $.extend(event, {
                        view: options.view,
                        ctrlKey: options.ctrlKey,
                        altKey: options.altKey,
                        shiftKey: options.shiftKey,
                        metaKey: options.metaKey,
                        keyCode: options.keyCode,
                        charCode: options.charCode
                    });
                }
            } else if (document.createEventObject) {
                event = document.createEventObject();
                $.extend(event, options);
            }

            // For user agents identifying as MSIE or Opera: report the
            // character code through keyCode and clear charCode.
            if (!!/msie [\w.]+/.exec(navigator.userAgent.toLowerCase()) || (({}).toString.call(window.opera) === "[object Opera]")) {
                event.keyCode = (options.charCode > 0) ? options.charCode : options.keyCode;
                event.charCode = undefined;
            }

            return event;
        },

        // Deliver `event` to `elem` through whichever dispatch API exists.
        dispatchEvent: function (elem, type, event) {
            if (elem.dispatchEvent) {
                elem.dispatchEvent(event);
            } else if (type === "click" && elem.click && elem.nodeName.toLowerCase() === "input") {
                elem.click();
            } else if (elem.fireEvent) {
                elem.fireEvent("on" + type, event);
            }
        },

        // Focus the target; if the native focus() did not fire a focus event,
        // trigger focusin and the focus handlers manually.
        simulateFocus: function () {
            var focusinEvent,
                triggered = false,
                element = $(this.target);

            function trigger() {
                triggered = true;
            }

            element.bind("focus", trigger);
            element[0].focus();

            if (!triggered) {
                focusinEvent = $.Event("focusin");
                focusinEvent.preventDefault();
                element.trigger(focusinEvent);
                element.triggerHandler("focus");
            }
            element.unbind("focus", trigger);
        },

        // Blur the target; if the native blur() did not fire a blur event,
        // trigger focusout and the blur handlers manually (asynchronously).
        simulateBlur: function () {
            var focusoutEvent,
                triggered = false,
                element = $(this.target);

            function trigger() {
                triggered = true;
            }

            element.bind("blur", trigger);
            element[0].blur();

            // blur events are async in IE
            setTimeout(function () {
                // IE won't let the blur occur if the window is inactive
                if (element[0].ownerDocument.activeElement === element[0]) {
                    element[0].ownerDocument.body.focus();
                }

                // Firefox won't trigger events if the window is inactive
                // IE doesn't trigger events if we had to manually focus the body
                if (!triggered) {
                    focusoutEvent = $.Event("focusout");
                    focusoutEvent.preventDefault();
                    element.trigger(focusoutEvent);
                    element.triggerHandler("blur");
                }
                element.unbind("blur", trigger);
            }, 1);
        }
    });


    /** complex events **/

    // Viewport (client) coordinates of the centre of `elem`.
    function findCenter(elem) {
        var offset,
            document = $(elem.ownerDocument);
        elem = $(elem);
        offset = elem.offset();

        return {
            x: offset.left + elem.outerWidth() / 2 - document.scrollLeft(),
            y: offset.top + elem.outerHeight() / 2 - document.scrollTop()
        };
    }

    // Viewport (client) coordinates of the top-left corner of `elem`.
    function findCorner(elem) {
        var offset,
            document = $(elem.ownerDocument);
        elem = $(elem);
        offset = elem.offset();

        return {
            x: offset.left - document.scrollLeft(),
            y: offset.top - document.scrollTop()
        };
    }

    $.extend($.simulate.prototype, {
        /**
         * Simulate a drag: mousedown on the target, `moves` mousemove events on
         * its document, then mouseup (plus click when the target is still in
         * the document).
         *
         * Options read: handle ("corner" to start from the top-left corner,
         * anything else starts from the centre), dx/dy (relative distance),
         * x/y (absolute client destination, used when dx/dy is not set),
         * moves (number of mousemove steps, default 3).
         */
        simulateDrag: function () {
            var i = 0,
                target = this.target,
                eventDoc = target.ownerDocument,
                // Options are optional: .simulate("drag") used to throw here.
                options = this.options || {},
                center = options.handle === "corner" ? findCorner(target) : findCenter(target),
                x = Math.floor(center.x),
                y = Math.floor(center.y),
                coord = {clientX: x, clientY: y},
                dx = options.dx || ( options.x !== undefined ? options.x - x : 0 ),
                dy = options.dy || ( options.y !== undefined ? options.y - y : 0 ),
                moves = options.moves || 3;

            this.simulateEvent(target, "mousedown", coord);

            for (; i < moves; i++) {
                x += dx / moves;
                y += dy / moves;

                coord = {
                    clientX: Math.round(x),
                    clientY: Math.round(y)
                };

                this.simulateEvent(eventDoc, "mousemove", coord);
            }

            if ($.contains(eventDoc, target)) {
                this.simulateEvent(target, "mouseup", coord);
                this.simulateEvent(target, "click", coord);
            } else {
                this.simulateEvent(eventDoc, "mouseup", coord);
            }
        }
    });

})(jQuery);
格式化json数据,高亮显示
/**
 * Render JSON as HTML with one <span> per token for syntax highlighting.
 *
 * Non-string input is first serialized with JSON.stringify (2-space indent).
 * The text is then HTML-escaped (&, <, >) so that markup inside string values
 * cannot be injected into the page, and every token is wrapped in
 * <span class="...">, where the class is one of
 * key | string | number | boolean | null.
 *
 * @param json a JSON string, or any value to be serialized
 * @returns {string} HTML markup; an empty string when the input cannot be
 *          serialized to JSON text (e.g. undefined or a function)
 */
function jsonSyntaxHighLight(json) {
    if (typeof json != 'string') {
        json = JSON.stringify(json, undefined, 2);
    }
    // JSON.stringify returns undefined for undefined/functions/symbols.
    if (typeof json != 'string') {
        return '';
    }
    // Escape HTML metacharacters; "&" must go first so that the entities
    // produced for "<" and ">" are not escaped a second time.
    json = json.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
    return json.replace(/("(\\u[a-zA-Z0-9]{4}|\\[^u]|[^\\"])*"(\s*:)?|\b(true|false|null)\b|-?\d+(?:\.\d*)?(?:[eE][+\-]?\d+)?)/g, function (match) {
        var cls = 'number';
        if (/^"/.test(match)) {
            // A quoted token followed by a colon is an object key.
            if (/:$/.test(match)) {
                cls = 'key';
            } else {
                cls = 'string';
            }
        } else if (/true|false/.test(match)) {
            cls = 'boolean';
        } else if (/null/.test(match)) {
            cls = 'null';
        }
        return '<span class="' + cls + '">' + match + '</span>';
    });
}
操作:(以懒财网公告为例测试;目前已经测试过懒财、cnblogs等网站) 1.首先安装tampermonkey插件,下载地址: http://tampermonkey.net/ 2.新建脚本,复制web-extract-list.js的内容粘贴,ctrl+s保存 3.新建脚本,复制web-extract-detail.js的内容粘贴,ctrl+s保存 4.打开https://www.lancai.cn/about/notice.html 看执行效果
采集结束之后,json页面:
注意:根据采集的网站不同需要变更js文件里面的// @match 处匹配的url, 以及task_json的脚本配置信息
项目代码github地址:https://github.com/jstarseven/web-list-extract
码字挺累的,转载请注明出处:http://www.cnblogs.com/jstarseven/p/6278197.html
-END-