forked from fictive-kin/openrecipes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
thepioneerwoman_feedspider.py
85 lines (71 loc) · 3.38 KB
/
thepioneerwoman_feedspider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector, XmlXPathSelector
from openrecipes.items import RecipeItem
class ThepioneerwomanfeedSpider(BaseSpider):
    """
    Spider for the RSS feed of thepioneerwoman.com.

    This parses the RSS feed, grabs the original link to each entry,
    and scrapes just those pages. It should be used to keep up to date
    after the existing recipes have been backfilled by crawling the
    whole site.
    """

    name = "thepioneerwoman.feed"
    # The feed is served through FeedBurner, so requests may touch the
    # proxy domains as well as the origin site.
    allowed_domains = [
        "thepioneerwoman.com",
        "feeds.feedburner.com",
        "feedproxy.google.com",
    ]
    start_urls = [
        "http://feeds.feedburner.com/pwcooks",
    ]

    def parse(self, response):
        """
        Custom feed parser: extract each item's <feedburner:origLink>
        and follow it to the real recipe page.

        Getting the data out of <content:encoded> instead would be
        overly complex — we would have to decode all the encoded
        characters and then build a DOM from that.

        :param response: the XML feed response
        :returns: a list of Requests, one per feed entry, handled by
                  ``parse_item``
        """
        xxs = XmlXPathSelector(response)
        # local-name() sidesteps namespace registration for the
        # feedburner:origLink element.
        links = xxs.select(
            "//item/*[local-name()='origLink']/text()"
        ).extract()
        return [Request(x, callback=self.parse_item) for x in links]

    def parse_item(self, response):
        """
        Scrape a single recipe page into a RecipeItem.

        NOTE: this is identical to
        spiders.ThepioneerwomancrawlSpider.parse_item(), which is
        probably not good — we should sort out a way to not repeat
        ourselves.

        :param response: an HTML recipe page
        :returns: a list of RecipeItem, one per hRecipe scope found
        """
        hxs = HtmlXPathSelector(response)

        # Each recipe on the page is wrapped in a data-vocabulary.org
        # Recipe microdata scope; page-level metadata comes from the
        # OpenGraph <meta> tags.
        base_path = """//div[@itemtype="http://data-vocabulary.org/Recipe"]"""
        recipes_scopes = hxs.select(base_path)

        name_path = '//meta[@property="og:title"]/@content'
        description_path = '//meta[@property="og:description"]/@content'
        url_path = '//meta[@property="og:url"]/@content'
        image_path = '//meta[@property="og:image"][1]/@content'
        prepTime_path = '*//*[@itemprop="prepTime"]/@datetime'
        cookTime_path = '*//*[@itemprop="cookTime"]/@datetime'
        recipeYield_path = '*//*[@itemprop="yield"]/text()'
        ingredients_path = '*//*[@itemprop="ingredient"]'
        datePublished = '*/*[@itemprop="published"]/@datetime'

        recipes = []
        for r_scope in recipes_scopes:
            item = RecipeItem()

            item['name'] = r_scope.select(name_path).extract()
            item['image'] = r_scope.select(image_path).extract()
            item['url'] = r_scope.select(url_path).extract()
            item['description'] = r_scope.select(description_path).extract()

            item['prepTime'] = r_scope.select(prepTime_path).extract()
            item['cookTime'] = r_scope.select(cookTime_path).extract()
            item['recipeYield'] = r_scope.select(recipeYield_path).extract()

            # Each ingredient scope holds an "amount" and a "name"
            # child; flatten the pair into a single display string.
            ingredient_scopes = r_scope.select(ingredients_path)
            ingredients = []
            for i_scope in ingredient_scopes:
                amount = i_scope.select('*[@itemprop="amount"]/text()').extract()
                name = i_scope.select('*[@itemprop="name"]/text()').extract()
                amount = "".join(amount).strip()
                name = "".join(name).strip()
                ingredients.append("%s %s" % (amount, name))
            item['ingredients'] = ingredients

            item['datePublished'] = r_scope.select(datePublished).extract()

            recipes.append(item)

        return recipes