Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
cli
chapter5/downloaded/
2 changes: 1 addition & 1 deletion chapter1/2-beautifulSoup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
bsObj = BeautifulSoup(html.read())
bsObj = BeautifulSoup(html.read(), "html.parser")
print(bsObj.h1)
6 changes: 3 additions & 3 deletions chapter1/3-exceptionHandling.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def getTitle(url):
print(e)
return None
try:
bsObj = BeautifulSoup(html.read())
bsObj = BeautifulSoup(html.read(), "html.parser")
title = bsObj.body.h1
except AttributeError as e:
return None
Expand All @@ -22,5 +22,5 @@ def getTitle(url):
print("Title could not be found")
else:
print(title)


4 changes: 2 additions & 2 deletions chapter2/1-selectByClass.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bsObj = BeautifulSoup(html)
bsObj = BeautifulSoup(html, "html.parser")
nameList = bsObj.findAll("span", {"class":"green"})
for name in nameList:
print(name.get_text())
print(name.get_text())
4 changes: 2 additions & 2 deletions chapter2/2-selectByAttribute.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bsObj = BeautifulSoup(html)
bsObj = BeautifulSoup(html, "html.parser")
allText = bsObj.findAll(id="text")
print(allText[0].get_text())
print(allText[0].get_text())
4 changes: 2 additions & 2 deletions chapter2/3-findDescendants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html)
bsObj = BeautifulSoup(html, "html.parser")

for child in bsObj.find("table",{"id":"giftList"}).children:
print(child)
print(child)
4 changes: 2 additions & 2 deletions chapter2/4-findSiblings.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html)
bsObj = BeautifulSoup(html, "html.parser")

for sibling in bsObj.find("table",{"id":"giftList"}).tr.next_siblings:
print(sibling)
print(sibling)
4 changes: 2 additions & 2 deletions chapter2/5-findParents.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html)
print(bsObj.find("img",{"src":"../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())
bsObj = BeautifulSoup(html, "html.parser")
print(bsObj.find("img",{"src":"../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())
4 changes: 2 additions & 2 deletions chapter2/6-regularExpressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import re

html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html)
bsObj = BeautifulSoup(html, "html.parser")
images = bsObj.findAll("img", {"src":re.compile("\.\.\/img\/gifts/img.*\.jpg")})
for image in images:
for image in images:
print(image["src"])
4 changes: 2 additions & 2 deletions chapter2/7-lambdaExpressions.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/page2.html")
bsObj = BeautifulSoup(html)
bsObj = BeautifulSoup(html, "html.parser")
tags = bsObj.findAll(lambda tag: len(tag.attrs) == 2)
for tag in tags:
print(tag)
print(tag)
4 changes: 2 additions & 2 deletions chapter3/1-getWikiLinks.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@
random.seed(datetime.datetime.now())
def getLinks(articleUrl):
html = urlopen("http://en.wikipedia.org"+articleUrl)
bsObj = BeautifulSoup(html)
bsObj = BeautifulSoup(html, "html.parser")
return bsObj.find("div", {"id":"bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))
links = getLinks("/wiki/Kevin_Bacon")
while len(links) > 0:
newArticle = links[random.randint(0, len(links)-1)].attrs["href"]
print(newArticle)
links = getLinks(newArticle)
links = getLinks(newArticle)
6 changes: 3 additions & 3 deletions chapter3/2-crawlWikipedia.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@
def getLinks(pageUrl):
global pages
html = urlopen("http://en.wikipedia.org"+pageUrl)
bsObj = BeautifulSoup(html)
bsObj = BeautifulSoup(html, "html.parser")
try:
print(bsObj.h1.get_text())
print(bsObj.find(id ="mw-content-text").findAll("p")[0])
print(bsObj.find(id="ca-edit").find("span").find("a").attrs['href'])
except AttributeError:
print("This page is missing something! No worries though!")

for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
if 'href' in link.attrs:
if link.attrs['href'] not in pages:
Expand All @@ -22,4 +22,4 @@ def getLinks(pageUrl):
print("----------------\n"+newPage)
pages.add(newPage)
getLinks(newPage)
getLinks("")
getLinks("")
16 changes: 8 additions & 8 deletions chapter3/3-crawlSite.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def getInternalLinks(bsObj, includeUrl):
if link.attrs['href'] not in internalLinks:
internalLinks.append(link.attrs['href'])
return internalLinks

#Retrieves a list of all external links found on a page
def getExternalLinks(bsObj, excludeUrl):
externalLinks = []
Expand All @@ -34,18 +34,18 @@ def splitAddress(address):

def getRandomExternalLink(startingPage):
html = urlopen(startingPage)
bsObj = BeautifulSoup(html)
bsObj = BeautifulSoup(html, "html.parser")
externalLinks = getExternalLinks(bsObj, splitAddress(startingPage)[0])
if len(externalLinks) == 0:
internalLinks = getInternalLinks(startingPage)
return getNextExternalLink(internalLinks[random.randint(0,
internalLinks = getInternalLinks(bsObj, startingPage)
return getExternalLinks(bsObj, internalLinks[random.randint(0,
len(internalLinks)-1)])
else:
return externalLinks[random.randint(0, len(externalLinks)-1)]

def followExternalOnly(startingSite):
externalLink = getRandomExternalLink("http://oreilly.com")
externalLink = getRandomExternalLink(startingSite)
print("Random external link is: "+externalLink)
followExternalOnly(externalLink)
followExternalOnly("http://oreilly.com")

followExternalOnly("http://oreilly.com")
6 changes: 3 additions & 3 deletions chapter3/4-getExternalLinks.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def getInternalLinks(bsObj, includeUrl):
else:
internalLinks.append(link.attrs['href'])
return internalLinks

#Retrieves a list of all external links found on a page
def getExternalLinks(bsObj, excludeUrl):
externalLinks = []
Expand All @@ -36,7 +36,7 @@ def getExternalLinks(bsObj, excludeUrl):

def getRandomExternalLink(startingPage):
html = urlopen(startingPage)
bsObj = BeautifulSoup(html)
bsObj = BeautifulSoup(html, "html.parser")
externalLinks = getExternalLinks(bsObj, urlparse(startingPage).netloc)
if len(externalLinks) == 0:
print("No external links, looking around the site for one")
Expand All @@ -45,7 +45,7 @@ def getRandomExternalLink(startingPage):
return getRandomExternalLink(internalLinks[random.randint(0,len(internalLinks)-1)])
else:
return externalLinks[random.randint(0, len(externalLinks)-1)]

def followExternalOnly(startingSite):
externalLink = getRandomExternalLink(startingSite)
print("Random external link is: "+externalLink)
Expand Down
8 changes: 4 additions & 4 deletions chapter3/5-getAllExternalLinks.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def getInternalLinks(bsObj, includeUrl):
else:
internalLinks.append(link.attrs['href'])
return internalLinks

#Retrieves a list of all external links found on a page
def getExternalLinks(bsObj, excludeUrl):
externalLinks = []
Expand All @@ -36,7 +36,7 @@ def getExternalLinks(bsObj, excludeUrl):

def getRandomExternalLink(startingPage):
html = urlopen(startingPage)
bsObj = BeautifulSoup(html)
bsObj = BeautifulSoup(html, "html.parser")
externalLinks = getExternalLinks(bsObj, urlparse(startingPage).netloc)
if len(externalLinks) == 0:
print("No external links, looking around the site for one")
Expand All @@ -45,12 +45,12 @@ def getRandomExternalLink(startingPage):
return getRandomExternalLink(internalLinks[random.randint(0,len(internalLinks)-1)])
else:
return externalLinks[random.randint(0, len(externalLinks)-1)]

def followExternalOnly(startingSite):
externalLink = getRandomExternalLink(startingSite)
print("Random external link is: "+externalLink)
followExternalOnly(externalLink)

#Collects a list of all external URLs found on the site
allExtLinks = set()
allIntLinks = set()
Expand Down
18 changes: 0 additions & 18 deletions chapter3/scrapy/wikiSpider/wiki.log

This file was deleted.

Binary file modified chapter3/scrapy/wikiSpider/wikiSpider/__init__.pyc
Binary file not shown.
Binary file modified chapter3/scrapy/wikiSpider/wikiSpider/items.pyc
Binary file not shown.
Binary file modified chapter3/scrapy/wikiSpider/wikiSpider/settings.pyc
Binary file not shown.
Binary file modified chapter3/scrapy/wikiSpider/wikiSpider/spiders/__init__.pyc
Binary file not shown.
Binary file modified chapter3/scrapy/wikiSpider/wikiSpider/spiders/articleSpider.pyc
Binary file not shown.
61 changes: 61 additions & 0 deletions chapter4/6-wikiHistories-Chinese.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from urllib.request import urlopen
from urllib.request import HTTPError
from bs4 import BeautifulSoup
import datetime
import json
import random
import re

# Seed the RNG from the current time. random.seed() only accepts
# None/int/float/str/bytes/bytearray (a datetime object raises TypeError on
# Python 3.11+), so pass the float POSIX timestamp instead.
random.seed(datetime.datetime.now().timestamp())
def getLinks(articleUrl):
    """Return the article links found in the body of a Wikipedia page.

    articleUrl is a site-relative path (for example
    "/wiki/Python_(programming_language)") that is appended to
    "http://en.wikipedia.org". Only <a> tags inside the div with id
    "bodyContent" whose href starts with "/wiki/" and contains no colon
    are returned. An empty list is returned when the page has no such div.
    """
    # Parse inside the "with" so the HTTP response is closed right after use.
    with urlopen("http://en.wikipedia.org"+articleUrl) as html:
        bsObj = BeautifulSoup(html, "html.parser")
    body = bsObj.find("div", {"id":"bodyContent"})
    if body is None:
        # No article body on this page: report "no links" so callers that
        # check len(links) keep working instead of hitting AttributeError.
        return []
    return body.findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))

def getHistoryIPs(pageUrl):
    """Return the set of anonymous-editor names from a page's revision history.

    pageUrl is an article href such as "/wiki/Some_Title"; the "/wiki/"
    part is removed and the remainder is used as the title parameter of the
    history URL. The text of every <a class="mw-anonuserlink"> element on
    that history page is collected into a set, so duplicates are dropped.
    """
    #Format of revision history pages is:
    #http://en.wikipedia.org/w/index.php?title=Title_in_URL&action=history
    pageUrl = pageUrl.replace("/wiki/", "")
    historyUrl = "http://en.wikipedia.org/w/index.php?title="+pageUrl+"&action=history"
    print("history url is: "+historyUrl)
    # Parse inside the "with" so the HTTP response is closed right after use.
    with urlopen(historyUrl) as html:
        bsObj = BeautifulSoup(html, "html.parser")
    #finds only the links with class "mw-anonuserlink" which has IP addresses
    #instead of usernames
    anonLinks = bsObj.findAll("a", {"class":"mw-anonuserlink"})
    return {anonLink.get_text() for anonLink in anonLinks}


def getCountry(ipAddress):
    """Look up ipAddress on ip138.com and return the location text, or None.

    The lookup page is fetched and decoded as gb2312. The first text node
    containing ":" is split on ":" and its third field is returned. If there
    is no such node or field, the first two text nodes containing "数据" are
    used instead. None is returned when the request fails, the response
    cannot be decoded, or parsing raises AttributeError.
    """
    # URLError is the base class of HTTPError, so this also covers DNS and
    # connection failures that previously aborted the whole crawl.
    from urllib.error import URLError

    try:
        # "with" closes the HTTP response once the body has been read.
        with urlopen("http://www.ip138.com/ips1388.asp?action=2&ip="+ipAddress) as page:
            html = page.read().decode('gb2312')
    except (URLError, UnicodeDecodeError):
        # Best-effort lookup: an unreachable service or a body that is not
        # valid gb2312 simply means "country unknown".
        return None
    try:
        bsObj = BeautifulSoup(html, "html.parser")
        try:
            response = bsObj.findAll( text=re.compile(":"))[0].split(":")[2]
        except IndexError:
            # NOTE(review): this fallback is a list slice, so the str() below
            # yields the list's repr rather than plain text - confirm intent.
            response = bsObj.findAll( text=re.compile("数据"))[0:2]
    except AttributeError:
        return None

    return str(response)

# Start from the Python article and keep hopping to a random linked article
# for as long as the current page yields any links.
links = getLinks("/wiki/Python_(programming_language)")

while links:
    # Report the location of every anonymous editor of every linked article.
    for link in links:
        print("-------------------")
        for historyIP in getHistoryIPs(link.attrs["href"]):
            country = getCountry(historyIP)
            if country is None:
                continue
            print(historyIP+" is from "+country)

    # Pick the next page uniformly at random from the current link list.
    nextIndex = random.randrange(len(links))
    links = getLinks(links[nextIndex].attrs["href"])
44 changes: 44 additions & 0 deletions chapter4/6-wikiHistories-no-locations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from urllib.request import urlopen
from urllib.request import HTTPError
from bs4 import BeautifulSoup
import datetime
import json
import random
import re

# Seed the RNG from the current time. random.seed() only accepts
# None/int/float/str/bytes/bytearray (a datetime object raises TypeError on
# Python 3.11+), so pass the float POSIX timestamp instead.
random.seed(datetime.datetime.now().timestamp())
def getLinks(articleUrl):
    """Return the article links found in the body of a Wikipedia page.

    articleUrl is a site-relative path (for example
    "/wiki/Python_(programming_language)") that is appended to
    "http://en.wikipedia.org". Only <a> tags inside the div with id
    "bodyContent" whose href starts with "/wiki/" and contains no colon
    are returned. An empty list is returned when the page has no such div.
    """
    # Parse inside the "with" so the HTTP response is closed right after use.
    with urlopen("http://en.wikipedia.org"+articleUrl) as html:
        bsObj = BeautifulSoup(html, "html.parser")
    body = bsObj.find("div", {"id":"bodyContent"})
    if body is None:
        # No article body on this page: report "no links" so callers that
        # check len(links) keep working instead of hitting AttributeError.
        return []
    return body.findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))

def getHistoryIPs(pageUrl):
    """Return the set of anonymous-editor names from a page's revision history.

    pageUrl is an article href such as "/wiki/Some_Title"; the "/wiki/"
    part is removed and the remainder is used as the title parameter of the
    history URL. The text of every <a class="mw-anonuserlink"> element on
    that history page is collected into a set, so duplicates are dropped.
    """
    #Format of revision history pages is:
    #http://en.wikipedia.org/w/index.php?title=Title_in_URL&action=history
    pageUrl = pageUrl.replace("/wiki/", "")
    historyUrl = "http://en.wikipedia.org/w/index.php?title="+pageUrl+"&action=history"
    print("history url is: "+historyUrl)
    # Parse inside the "with" so the HTTP response is closed right after use.
    with urlopen(historyUrl) as html:
        bsObj = BeautifulSoup(html, "html.parser")
    #finds only the links with class "mw-anonuserlink" which has IP addresses
    #instead of usernames
    anonLinks = bsObj.findAll("a", {"class":"mw-anonuserlink"})
    return {anonLink.get_text() for anonLink in anonLinks}



# Start from the Python article and keep hopping to a random linked article
# for as long as the current page yields any links.
links = getLinks("/wiki/Python_(programming_language)")

while links:
    # Print every anonymous editor found in each linked article's history.
    for link in links:
        print("-------------------")
        anonEditors = getHistoryIPs(link.attrs["href"])
        for historyIP in anonEditors:
            print(historyIP)

    # Pick the next page uniformly at random from the current link list.
    nextIndex = random.randrange(len(links))
    links = getLinks(links[nextIndex].attrs["href"])
6 changes: 6 additions & 0 deletions chapter4/google-api-key.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
AIzaSyD9Dns12MuQ0ZtLFh-fvjdlpRSavXw6lRM

Usage:
curl -d @google-maps-geoapi-example.json -H "Content-Type: application/json" -i "https://www.googleapis.com/geolocation/v1/geolocate?key=AIzaSyD9Dns12MuQ0ZtLFh-fvjdlpRSavXw6lRM"

Warning: Replace the key= value in the command above with your own API key.
31 changes: 31 additions & 0 deletions chapter4/google-maps-geoapi-example.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{
"homeMobileCountryCode": 310,
"homeMobileNetworkCode": 260,
"radioType": "gsm",
"carrier": "T-Mobile",
"cellTowers": [
{
"cellId": 39627456,
"locationAreaCode": 40495,
"mobileCountryCode": 310,
"mobileNetworkCode": 260,
"age": 0,
"signalStrength": -95
}
],
"wifiAccessPoints": [
{
"macAddress": "01:23:45:67:89:AB",
"signalStrength": 8,
"age": 0,
"signalToNoiseRatio": -65,
"channel": 8
},
{
"macAddress": "01:23:45:67:89:AC",
"signalStrength": 4,
"age": 0
}
]
}

Loading