有时候想将整个会议的论文都下载下来,手动下载太麻烦。这件事应该有浏览器插件可以完成,不过我没有去搜,而是写了个 Python 脚本来解决:将指定 URL 页面上指定类型的文件全部下载下来!
格式:python xx.py url file_type
参数一:要抓取的页面 URL;参数二:文件类型(不带点的扩展名,例如 pdf)
1: #!/usr/bin/env python
2: #encoding=utf-8
3:
4: import urllib, urllib2
5: import re
6: import os,sys
7:
def _fetch(url):
    """Return the raw bytes served at ``url``.

    ``urlopen`` is imported locally so the helper runs on both Python 2
    (``urllib2``) and Python 3 (``urllib.request``).  The response is always
    closed, even when reading fails.
    """
    try:
        from urllib2 import urlopen  # Python 2
    except ImportError:
        from urllib.request import urlopen  # Python 3
    response = urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def _download_links(page_url, htmldata, attr, file_type, path, fail_prefix):
    """Download every ``attr="...<file_type>"`` target found in ``htmldata``.

    Args:
        page_url: URL of the page the HTML came from; links are resolved
            against it.
        htmldata: The page's HTML as text.
        attr: Attribute name to scan, ``"href"`` or ``"src"``.
        file_type: File extension without the dot (e.g. ``"pdf"``).
        path: Existing directory the files are written into.
        fail_prefix: Text printed in front of a URL that could not be saved.

    Returns:
        The list of local file paths that were written successfully.
    """
    try:
        from urlparse import urljoin  # Python 2
    except ImportError:
        from urllib.parse import urljoin  # Python 3

    print("===>>>" + attr + "<<<===")
    # file_type is escaped so it is matched literally, and the captured value
    # may not contain a quote so a match cannot run past the closing '"'.
    # The trailing \w{0,2} keeps the original tolerance for slightly longer
    # extensions (e.g. "htm" also matches "html").
    pattern = attr + r'="([^\s"]{3,50}\.' + re.escape(file_type) + r'\w{0,2})"'
    links = re.findall(pattern, htmldata)
    if not links:
        print("no" + " ." + file_type + " files")
        return []

    saved = []
    for link in links:
        # urljoin handles absolute (http/https), root-relative and relative
        # links, and a page URL with or without a trailing file name.
        url = urljoin(page_url, link)
        try:
            print(url + "\tdownloading ......")
            filedata = _fetch(url)
            print("read " + url)
            filestr = os.path.join(path, url.rsplit("/", 1)[-1])
            print("file is " + filestr)
            with open(filestr, "wb") as fp:
                fp.write(filedata)
        except Exception as err:
            # Best effort: one broken link must not abort the whole batch.
            # Unlike a bare ``except:``, Ctrl-C and SystemExit still propagate.
            print(fail_prefix + url + " (" + str(err) + ")")
        else:
            saved.append(filestr)
    return saved


def get_files(ourl, file_type, path="E:\\temp\\"):
    """Download all files of one type that a web page links to.

    The page at ``ourl`` is fetched once and scanned twice: first for
    ``href="..."`` targets, then for ``src="..."`` targets, whose value ends
    in ``.<file_type>`` (optionally followed by up to two more word
    characters).  Each target is resolved against ``ourl`` and saved under
    ``path`` using the last segment of its URL as the file name.

    Args:
        ourl: URL of the page to scan.
        file_type: File extension without the dot (e.g. ``"pdf"``).
        path: Output directory; created (including parents) if missing.
            Defaults to the script's original location, ``E:\\temp\\``.

    Returns:
        The list of local file paths that were written.

    Raises:
        Whatever ``urlopen`` raises if the page itself cannot be fetched;
        failures of individual downloads are reported and skipped.
    """
    print("The URL is " + ourl)
    print("The File Type is " + file_type)
    if not os.path.isdir(path):
        os.makedirs(path)

    print("accessing " + ourl)
    htmldata = _fetch(ourl)
    if not isinstance(htmldata, str):
        # Python 3 hands back bytes; decode leniently since only the
        # attribute values matter and the page encoding is unknown here.
        htmldata = htmldata.decode("utf-8", "replace")

    saved = _download_links(ourl, htmldata, "href", file_type, path,
                            "cann't get ")
    saved += _download_links(ourl, htmldata, "src", file_type, path,
                             "cann't get >> ")
    return saved
69:
if __name__ == "__main__":
    # Usage: python xx.py url file_type
    # Both positional arguments are required; report that clearly instead of
    # failing with an IndexError traceback when one is missing.
    if len(sys.argv) < 3:
        sys.exit("usage: python " + sys.argv[0] + " url file_type")
    get_files(sys.argv[1], sys.argv[2])