Downloading Manga with Just One Script



Lately I've had some free time and wanted to use it to catch up on the One Piece (OP) manga. I opened the Shuhui manga site, found the One Piece index, and clicked the chapter I needed to catch up on. I was all set to enjoy the world of OP with the air conditioning on and watermelon in hand. Then the unexpected happened: the instant I clicked, the page redirected.




I could only watch as it jumped to Tencent's exasperating simplified-Chinese edition. You can imagine the cursing going on in my head!!!

Because of the copyright dispute, Shuhui had no choice but to bow out and reinvent itself as a manga discussion community. The last few chapters do come with third-party links, but reading that way always feels a bit like sneaking around. Fortunately the mobile app still keeps its built-in reader, but the small screen makes for a poor experience. That's where the idea for this simple scraper came from.

I started by capturing the app's traffic to probe Shuhui's backend API and came away with a few key endpoints. Then I wrote the following script:
Script download: cartoon_download.py
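
Before diving into the full script, here is a minimal sketch of how the chapter-list endpoint behaves. The URL pattern and the JSON fields (Return, List, and Id / ChapterNo / Title per item) are the ones the script below relies on, and the comic id 2 (One Piece) is taken from its repo table; treat these details as assumptions that may change whenever the backend does.

# Sketch only: ask the backend for one page of chapters and print them.
# Endpoint pattern and JSON field names are assumed from the packet capture
# reflected in the script below.
import json
import urllib

url = 'http://www.ishuhui.net/ComicBooks/GetChapterList?id=2&PageIndex=0'
data = json.loads(urllib.urlopen(url).read())
for item in data["Return"]["List"]:
    print item["ChapterNo"], item["Title"], item["Id"]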
#! /usr/bin/python
# -*- coding:utf-8 -*-
import urllib
import socket
import json
import os
import HTMLParser
import threading
import time
import sys
global jsonstr, hostname, croot, maxtimeout, tryagaincount
global threadcount, currentthreadnum, mutex, thenewest
hostname = r'http://www.ishuhui.net'
chapters = r'/ComicBooks/GetChapterList?id=' + '%d' + '&PageIndex=' + '%d'
details = r'/ComicBooks/ReadComicBooksToIsoV1/' + '%d'
# Map from the name given on the command line to the comic id used by the backend.
repo = {
    'op': 2,
    'SLAM_DUNK': 38,
    '火影忍者': 4,
    '银魂': 10,
    '妖精的尾巴': 3,
    '名侦探柯南': 1,
    'bleach': 23,
    '黑子的篮球': 6,
    '浪客剑心': 39,
    '结界师': 34
}
arg = '[op, SLAM_DUNK, 火影忍者, 银魂, 妖精的尾巴, 名侦探柯南, bleach, 黑子的篮球, 浪客剑心, 结界师]'
cartoonid = repo['op']
thenewest = 0
currentthreadnum = 0
threadcount = 6
tryagaincount = 5
maxtimeout = 30
croot = os.getcwd()
mutex = threading.Lock()
usage = """
Usage:
    cartoon_download [args...]
    cartoon_download cartoon
    cartoon_download cartoon path
    cartoon_download cartoon path newestcount
    cartoon_download cartoon path newestcount threadcount
For example:
    cartoon_download op /home/xxx/onepiece
Note:
    The current version only supports Linux/Unix.
    cartoon = %s
    path can be either absolute or relative, but must contain only ASCII characters.
    newestcount defaults to 0, which downloads all chapters; a positive value
    downloads only that many of the newest chapters.
""" % arg
if len(sys.argv) == 1:
    print usage
    sys.exit(0)
if len(sys.argv) >= 2:
    try:
        cartoonid = repo[sys.argv[1]]
    except Exception, e:
        print 'Please select from %s' % arg
        sys.exit(0)
if len(sys.argv) >= 3:
    targetdir = sys.argv[2]
    if not os.path.exists(targetdir):
        os.makedirs(targetdir)
    croot = os.path.abspath(targetdir)
if len(sys.argv) >= 4:
    thenewest = int(sys.argv[3])
if len(sys.argv) == 5:
    threadcount = int(sys.argv[4])
class MyHTMLParser(HTMLParser.HTMLParser):
    """Collect image URLs (gif/jpg/png) from <img> tags in a chapter page."""

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.gifs = []
        self.jpgs = []
        self.pngs = []

    def handle_starttag(self, tags, attrs):
        if tags == 'img':
            for attr in attrs:
                for htmlstr in attr:
                    if 'gif' in htmlstr:
                        self.gifs.append(htmlstr)
                    elif 'jpg' in htmlstr:
                        self.jpgs.append(htmlstr)
                    elif 'png' in htmlstr:
                        self.pngs.append(htmlstr)

    def get_gifs(self):
        return self.gifs

    def get_jpgs(self):
        return self.jpgs

    def get_pngs(self):
        return self.pngs
class DownloadTask(threading.Thread):
    """Worker thread that downloads a single image."""

    def __init__(self, name, srcurl):
        threading.Thread.__init__(self)
        self.name = name
        self.srcurl = srcurl

    def run(self):
        pic2file(self.srcurl)
def createtask(imgs):
    global mutex, currentthreadnum, threadcount
    print 'current tasks num >> %d' % len(imgs)
    for srcurl in imgs:
        # Throttle: wait until a worker slot frees up.
        while currentthreadnum >= threadcount:
            time.sleep(0.5)
        increasethread()
        threadname = '#%s' % srcurl
        task = DownloadTask(threadname, srcurl)
        task.start()
    # Wait for the remaining workers of this chapter to finish.
    while currentthreadnum > 0:
        time.sleep(0.5)
    print 'finished!'
    os.chdir(croot)
def parser2name(picurl):
    # Use the last path segment (query string stripped) as the file name.
    names = picurl.split('/')
    result = names[len(names) - 1]
    if '?' in result:
        result = result.split('?')[0]
    return result


def increasethread():
    global currentthreadnum, mutex
    mutex.acquire()
    currentthreadnum += 1
    mutex.release()


def decreasethread():
    global currentthreadnum, mutex
    mutex.acquire()
    currentthreadnum -= 1
    mutex.release()
def pic2file(picurl, times=0):
    protocol = picurl.split('/')[0]
    if not (protocol == 'http:' or protocol == 'https:'):
        decreasethread()
        return
    filename = parser2name(picurl)
    if os.path.exists(filename):
        # Already downloaded, skip it.
        decreasethread()
        return
    try:
        pic = urllib.urlopen(picurl)
        data = pic.read()
        picfile = open('%s' % filename, 'wb')
        picfile.write(data)
        picfile.close()
        decreasethread()
    except socket.timeout:
        if times < tryagaincount:
            print "Download '%s' timeout, trying again." % filename
            pic2file(picurl, times + 1)
        else:
            decreasethread()
            print "Tried %d times, but still failed to download %s." %\
                (tryagaincount, filename)
    except Exception as e:
        print '---pic2file error---', e
        if times < tryagaincount:
            print "Download '%s' failed, trying again." % filename
            pic2file(picurl, times + 1)
        else:
            decreasethread()
            print "Task '%s' failed after trying %d times." %\
                (picurl, tryagaincount)
def fixurl(picurl):
    # Give protocol-relative URLs ("//host/...") an explicit http: scheme.
    # Not called in the current flow, kept as a helper.
    piece = picurl.split('/')
    url = 'http:'
    for p in xrange(1, len(piece)):
        url += '/%s' % piece[p]
    return url
def fetchres(detailurl, dirpath, times=0):
    try:
        if not os.path.exists(dirpath):
            os.mkdir(dirpath)
        os.chdir(dirpath)
        curdir = os.getcwd()
        print 'Download for ' + curdir
        # Save the chapter page, then collect every image URL from it.
        detailbook = urllib.urlopen(detailurl).read()
        htmlfile = open('%s.html' % parser2name(curdir), 'wb')
        htmlfile.write(detailbook)
        htmlfile.close()
        parser = MyHTMLParser()
        parser.feed(detailbook)
        jpgs = parser.get_jpgs()
        pngs = parser.get_pngs()
        gifs = parser.get_gifs()
        imgs = jpgs + pngs + gifs
        createtask(imgs)
    except socket.timeout:
        print "Fetch '%s' timeout." % detailurl
        if times < tryagaincount:
            print "Retrying, attempt no.%d." % (times + 2)
            fetchres(detailurl, dirpath, times + 1)
        else:
            print "Tried %d times, but still failed." % tryagaincount
            print "####Please check network!####"
    except Exception, e:
        print e
def jsonparse():
    encodejson = json.loads(jsonstr)
    result = encodejson["Return"]["List"]
    if thenewest > 0:
        # Only fetch the newest `thenewest` chapters, then stop paging.
        for i in xrange(0, thenewest):
            parserandsavehtml(result[i])
        return False
    else:
        # Fetch everything on this page; an empty page ends the main loop.
        for x in result:
            parserandsavehtml(x)
        return len(result)


def parserandsavehtml(item):
    bookid = item["Id"]
    detailurl = hostname + details % bookid + '.html'
    chapterno = item["ChapterNo"]
    title = item["Title"]
    dirname = '%d %s' % (chapterno, title)
    # Note: os.path.join raises `UnicodeEncodeError` when 'croot' contains
    # Chinese characters; see http://www.cnblogs.com/abcat/p/3389531.html
    targetpath = os.path.join(croot, dirname)
    fetchres(detailurl, targetpath)
def calculatetime(used):
    if used <= 60:
        print 'Total used time is %ds.' % used
    elif used <= 3600:
        print 'Total used time is %dmins %ds.' % (used / 60, used % 60)
    else:
        print 'Total used time is %dhrs %dmins %ds.' %\
            (used / 3600, (used % 3600) / 60, (used % 3600) % 60)
if __name__ == '__main__':
    os.chdir(croot)
    socket.setdefaulttimeout(maxtimeout)
    start = time.time()
    try:
        # Page through the chapter list until jsonparse() reports an
        # empty page (or the newest-N request has been served).
        i = 0
        isbreak = True
        while isbreak:
            targetweb = hostname + chapters % (cartoonid, i)
            webfile = urllib.urlopen(targetweb)
            jsonstr = webfile.read()
            isbreak = jsonparse()
            i = i + 1
        end = time.time()
        calculatetime(int(end - start))
    except socket.timeout:
        print 'timeout'
    except Exception as e:
        print 'error', e
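
For reference, a typical run along the lines of the usage text above might look like this; the path and the two counts are just example values:

python cartoon_download.py op /home/xxx/onepiece 5 8    # 5 newest chapters, 8 download threads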


To finish, one more pic: Luffy in Sage Mode O(∩_∩)O



Disclaimer: this script is for learning and exchange only. Commercial use is prohibited, and I take no responsibility for any commercial disputes that may arise from it!