scrape_fb.py
import urllib2
import json
import datetime
import time
import re
import csv
import sys


def get_page(app_id, app_secret):
    """
    Initial query to the NYT Facebook feed.
    app_id and app_secret are concatenated to form an app access token;
    see the documentation here:
    https://developers.facebook.com/docs/facebook-login/access-tokens
    """
    baseurl = 'https://graph.facebook.com/v2.8/nytimes/feed/'
    access_token = '&access_token=' + app_id + '|' + app_secret
    fields = '?fields=id,name,link,created_time,reactions.summary(true),comments.summary(true),shares&limit=25'
    url = baseurl + fields + access_token
    dat = json.loads(request_until_succeed(url))
    return dat
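
# For reference, a response has (roughly) the shape implied by the `fields`
# parameter above; this abridged, illustrative sketch shows the keys that
# page_through and process_items rely on:
#   {"data": [{"id": "...", "link": "...", "created_time": "2016-11-07T...",
#              "reactions": {"summary": {"total_count": 0}},
#              "comments": {"summary": {"can_comment": true, "total_count": 0}},
#              "shares": {"count": 0}}],
#    "paging": {"next": "https://graph.facebook.com/..."}}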


def request_until_succeed(url):
    """
    Handles potential errors when sending a request: retries every
    5 seconds until the request returns HTTP 200.
    """
    req = urllib2.Request(url)
    success = False
    while not success:
        try:
            response = urllib2.urlopen(req)
            if response.getcode() == 200:
                success = True
        except Exception, e:
            print e
            time.sleep(5)
    return response.read()


def page_through(app_id, app_secret):
    """
    Pages through the NYT Facebook feed until there are no more pages,
    or the posts occur before 11/1.
    """
    has_next_page = True
    in_date_range = True
    # we only want to keep the articles that were returned from the NYT API,
    # so build a list of target urls to match against
    with open('output/article_search.json') as f:
        nyt_dat = json.load(f)
    nyt_urls = []
    for i in nyt_dat:
        nyt_urls.append(core_url(i['web_url']))
    items = get_page(app_id, app_secret)
    process_items(items, nyt_urls)
    while has_next_page and in_date_range:
        if 'paging' not in items:
            # no further pages; stop before requesting a missing 'next' url
            has_next_page = False
            break
        if items['data'][0]['created_time'][0:7] == '2016-10':
            # the current batch has reached October; process one last page
            in_date_range = False
        items = json.loads(request_until_succeed(items['paging']['next']))
        process_items(items, nyt_urls)
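
# Note: the Graph API returns 'created_time' as an ISO 8601 string, e.g.
# '2016-11-07T22:15:00+0000', so created_time[0:7] yields 'YYYY-MM' and the
# '2016-10' comparison above is a cheap "posts now predate 11/1" check.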


def process_items(json_dat, target_urls):
    """
    json_dat is the data returned from the FB Graph API query;
    target_urls is the list of urls returned from our NYT API query.
    Checks the link of each Facebook post: if the link is in the NYT data,
    the post's metadata is written to a csv file.
    """
    print('********Writing file for posts at %s' % json_dat['data'][0]['created_time'])
    with open('output/fb_dat/%s_facebook_dat.csv' % datetime.datetime.now(), 'wb') as outfile:
        w = csv.writer(outfile)
        w.writerow(['id', 'link', 'reactions', 'shares', 'can_comment', 'comments'])
        for item in json_dat['data']:
            try:
                if core_url(item['link']) in target_urls:
                    w.writerow([item['id'],
                                item['link'],
                                item['reactions']['summary']['total_count'],
                                item['shares']['count'],
                                item['comments']['summary']['can_comment'],
                                item['comments']['summary']['total_count']])
            except KeyError:
                # status updates and other posts without a link (or without
                # shares/comments fields) are skipped
                pass
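
# Note: the filename embeds datetime.datetime.now(), which stringifies with
# spaces and colons (e.g. '2016-11-08 12:00:00.000000_facebook_dat.csv');
# that is fine on Linux/macOS but would be rejected on Windows filesystems.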


def core_url(url):
    """
    Quick function to extract just the base part of the url
    (with the http(s):// scheme stripped).
    """
    m = re.search(r'(http(s)?://(((www|.+\.blogs)\.nytimes\.com.+)(roomfordebate/.+|/$|\.html)))', url)
    if m:
        return m.group(3)
    else:
        return '********url %s not pulled' % url
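
# Illustrative example (hypothetical URL): group(3) starts after the scheme,
# so http:// and https:// versions of the same article compare equal:
#   core_url('https://www.nytimes.com/2016/11/02/us/politics/example.html')
#   -> 'www.nytimes.com/2016/11/02/us/politics/example.html'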


def main(argv1, argv2):
    app_id = argv1
    app_secret = argv2
    page_through(app_id, app_secret)


if __name__ == '__main__':
    main(sys.argv[1], sys.argv[2])
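
# Example invocation (APP_ID and APP_SECRET stand in for your own Facebook
# app credentials):
#   python scrape_fb.py APP_ID APP_SECRET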