想写一个爬取人人网好友关系的小爬虫,能够从我的账户开始,爬我的朋友的朋友的朋友……。当遇到朋友隐私设置为不可读时,希望后期能够通过遍历所有其他人的朋友反推出该人的朋友,这就需要把人人网绝大部分人的关系爬出来。单线程实测了一下,大概每小时能爬一万个用户,流量在200kb左右,速度明显不够,后面希望多线程能给力些。另外还有一个问题:在爬到7 万用户的时候出现了死循环,原因还没找出来。
考虑到后面优化的几点
多线程
效率(换语言?)
爬id 和id 查重应该分开处理
程序说明:getone() 从todolist 中获取一个未爬过的用户,savenews() 将爬出来的用户id 保存并去重。每一千个用户保存一个文件,保存形式为:该用户ID@用户好友1ID 用户好友2ID ……
源码:
1: #!/usr/bin/env python
2: #encoding=utf-8
3:
4: import urllib, urllib2, cookielib, re, sys
5:
class Renren(object):
    """Crawler that walks the renren.com friend graph starting from one account.

    The crawl keeps two lists of user-id strings:

    * ``todolist`` -- ids discovered but not fetched yet (no duplicates,
      never contains an id that is already in ``donelist``);
    * ``donelist`` -- ids that have been handed out by ``getone()``.

    Results are written to the current directory: ``data0.txt`` holds the
    logged-in user's own friends, and every ``data_<n>.txt`` holds up to
    ``BATCH_SIZE`` lines of the form ``<user id>@<friend id> <friend id> ...``.
    """

    # File used to persist the cookie jar between runs.
    COOKIE_FILE = 'renren.cookie'
    # Number of users crawled per output file.
    BATCH_SIZE = 1000
    # Extracts ids from the logged-in user's own friend list.
    _MY_FRIEND_ID_RE = re.compile(r'"id":(\d{6,15}),')
    # Extracts ids from one page of another user's friend list.
    _FRIEND_ID_RE = re.compile(r'profile.do\?id=(\d{7,15})"><img')

    def __init__(self, email, password):
        """Store the credentials and build a cookie-aware URL opener.

        Args:
            email: account name used to log in.
            password: account password.
        """
        self.email = email
        self.password = password
        self.origURL = 'http://www.renren.com/Home.do'
        self.domain = 'renren.com'
        # Defined up front so getone()/savenews() work before friends() runs.
        self.todolist = []
        self.donelist = []
        # Reuse cookies from a previous run when a cookie file exists.
        self.cj = cookielib.LWPCookieJar()
        try:
            self.cj.revert(self.COOKIE_FILE, ignore_discard=True)
        except IOError:
            # No readable cookie file yet (load errors are IOError
            # subclasses); start with an empty jar.
            pass
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj))
        urllib2.install_opener(self.opener)

    def login(self):
        """Post the credentials to the login endpoint and persist the cookies."""
        params = {
            'email': self.email,
            'password': self.password,
            'origURL': self.origURL,
            'domain': self.domain,
        }
        req = urllib2.Request(
            'http://www.renren.com/PLogin.do',
            urllib.urlencode(params)
        )
        response = self.opener.open(req)
        response.close()
        try:
            # Session cookies are flagged "discard"; keep them so the next
            # run can actually reuse the session.
            self.cj.save(self.COOKIE_FILE, ignore_discard=True)
        except IOError:
            # Persisting cookies is best-effort; the crawl works without it.
            pass

    def friends(self):
        """Crawl outwards from the logged-in user's friend list.

        Writes the seed ids to ``data0.txt`` and then crawls users in batches
        of ``BATCH_SIZE``, one ``data_<n>.txt`` file per batch, until the todo
        list is empty.
        """
        # Friend directory of the logged-in user.
        req = 'http://friend.renren.com/myfriendlistx.do'
        print("Get my friends")
        response = self.opener.open(req)
        try:
            data = response.read()
        finally:
            response.close()
        seeds = self._MY_FRIEND_ID_RE.findall(data)
        print("friends list")
        print("%s %s" % (seeds, len(seeds)))

        # Seed the queue through savenews() so it starts out de-duplicated.
        self.todolist = []
        self.donelist = []
        self.savenews(seeds)

        with open('data0.txt', 'w') as fdata:
            fdata.write(''.join(item + ' ' for item in seeds))

        sernum = 0
        while self.todolist:
            sernum += 1
            print("data" + str(sernum))
            batch = self._crawl_batch(self.BATCH_SIZE)
            self._write_batch("data_" + str(sernum) + ".txt", batch)

    def _crawl_batch(self, size):
        """Fetch up to ``size`` users; return a list of (user id, friend ids)."""
        batch = []
        for remaining in range(size - 1, -1, -1):
            rrid = self.getone()
            if rrid is None:
                # Queue drained before the batch was full.
                break
            print("%s %s" % (remaining, rrid))
            friend_ids = self.getfriends(rrid)
            self.savenews(friend_ids)
            batch.append((rrid, friend_ids))
        return batch

    def _write_batch(self, filename, batch):
        """Write one ``<user id>@<friend ids>`` line per crawled user."""
        with open(filename, 'w') as fp:
            for rrid, friend_ids in batch:
                # Same on-disk layout as earlier runs: ids in reverse
                # discovery order, each followed by a single space.
                line = ''.join(fid + ' ' for fid in reversed(friend_ids))
                fp.write(rrid + '@' + line)
                fp.write('\n')

    def getfriends(self, rrid):
        """Return the friend ids of user ``rrid``, following pagination.

        Args:
            rrid: user id as a string.

        Returns:
            List of friend id strings in page order, without duplicates.
        """
        friends = []
        seen = set()
        page = 0
        while True:
            req = ("http://friend.renren.com/GetFriendList.do?curpage="
                   + str(page) + '&id=' + rrid)
            print('Get ' + req)
            response = self.opener.open(req)
            try:
                data = response.read()
            finally:
                response.close()
            fresh = []
            for fid in self._FRIEND_ID_RE.findall(data):
                if fid not in seen:
                    seen.add(fid)
                    fresh.append(fid)
            # Stop on an empty page, and also on a page that only repeats ids
            # already collected; otherwise a server that keeps answering with
            # the same page would make this loop run forever.
            if not fresh:
                break
            friends.extend(fresh)
            page += 1
        return friends

    def getone(self):
        """Take the next pending id off the todo list and mark it as done.

        Returns:
            The id string, or ``None`` when the todo list is empty.
        """
        if not self.todolist:
            print("Empty todo list")
            return None
        popup = self.todolist.pop(0)  # oldest pending id first
        self.donelist.append(popup)
        return popup

    def savenews(self, newlist):
        """Queue the ids in ``newlist`` that are neither done nor queued yet.

        ``newlist`` itself is left untouched, so the caller can still write
        the complete friend list to disk afterwards.

        Args:
            newlist: iterable of id strings.
        """
        done = set(self.donelist)
        queued = set()
        merged = []
        for item in self.todolist + list(newlist):
            if item in done or item in queued:
                continue
            queued.add(item)
            merged.append(item)
        self.todolist = merged
98:
99:
100:
if __name__ == "__main__":
    # Entry point: log in with the configured account and start the crawl.
    semail = '[email protected]'
    spassword = 'xxxxxxxxxx'
    a = Renren(semail, spassword)
    # Only the account name is echoed; printing the password would leak it
    # into terminal scrollback and any captured logs.
    print("your account is %s" % semail)
    a.login()
    a.friends()