REMitchell · nvmexp · Apr 23, 2016 · Apr 23, 2016 · Apr 23, 2016 · Apr 23, 2016
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+cli
+chapter5/downloaded/
diff --git a/chapter1/2-beautifulSoup.py b/chapter1/2-beautifulSoup.py
@@ -2,5 +2,5 @@
 from bs4 import BeautifulSoup
 
 html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
-bsObj = BeautifulSoup(html.read())
+bsObj = BeautifulSoup(html.read(), "html.parser")
 print(bsObj.h1)
diff --git a/chapter1/3-exceptionHandling.py b/chapter1/3-exceptionHandling.py
@@ -11,7 +11,7 @@ def getTitle(url):
         print(e)
         return None
     try:
-        bsObj = BeautifulSoup(html.read())
+        bsObj = BeautifulSoup(html.read(), "html.parser")
         title = bsObj.body.h1
     except AttributeError as e:
         return None
@@ -22,5 +22,5 @@ def getTitle(url):
     print("Title could not be found")
 else:
     print(title)
-    
-    
+
+
diff --git a/chapter2/1-selectByClass.py b/chapter2/1-selectByClass.py
@@ -2,7 +2,7 @@
 from bs4 import BeautifulSoup
 
 html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
-bsObj = BeautifulSoup(html)
+bsObj = BeautifulSoup(html, "html.parser")
 nameList = bsObj.findAll("span", {"class":"green"})
 for name in nameList:
-    print(name.get_text())
+    print(name.get_text())
diff --git a/chapter2/2-selectByAttribute.py b/chapter2/2-selectByAttribute.py
@@ -2,6 +2,6 @@
 from bs4 import BeautifulSoup
 
 html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
-bsObj = BeautifulSoup(html)
+bsObj = BeautifulSoup(html, "html.parser")
 allText = bsObj.findAll(id="text")
-print(allText[0].get_text())
+print(allText[0].get_text())
diff --git a/chapter2/3-findDescendants.py b/chapter2/3-findDescendants.py
@@ -2,7 +2,7 @@
 from bs4 import BeautifulSoup
 
 html = urlopen("http://www.pythonscraping.com/pages/page3.html")
-bsObj = BeautifulSoup(html)
+bsObj = BeautifulSoup(html, "html.parser")
 
 for child in bsObj.find("table",{"id":"giftList"}).children:
-    print(child)
+    print(child)
diff --git a/chapter2/4-findSiblings.py b/chapter2/4-findSiblings.py
@@ -1,7 +1,7 @@
 from urllib.request import urlopen
 from bs4 import BeautifulSoup
 html = urlopen("http://www.pythonscraping.com/pages/page3.html")
-bsObj = BeautifulSoup(html)
+bsObj = BeautifulSoup(html, "html.parser")
 
 for sibling in bsObj.find("table",{"id":"giftList"}).tr.next_siblings:
-    print(sibling) 
+    print(sibling)
diff --git a/chapter2/5-findParents.py b/chapter2/5-findParents.py
@@ -2,5 +2,5 @@
 from bs4 import BeautifulSoup
 
 html = urlopen("http://www.pythonscraping.com/pages/page3.html")
-bsObj = BeautifulSoup(html)
-print(bsObj.find("img",{"src":"../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())
+bsObj = BeautifulSoup(html, "html.parser")
+print(bsObj.find("img",{"src":"../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())
diff --git a/chapter2/6-regularExpressions.py b/chapter2/6-regularExpressions.py
@@ -3,7 +3,7 @@
 import re
 
 html = urlopen("http://www.pythonscraping.com/pages/page3.html")
-bsObj = BeautifulSoup(html)
+bsObj = BeautifulSoup(html, "html.parser")
 images = bsObj.findAll("img", {"src":re.compile("\.\.\/img\/gifts/img.*\.jpg")})
-for image in images: 
+for image in images:
     print(image["src"])
diff --git a/chapter2/7-lambdaExpressions.py b/chapter2/7-lambdaExpressions.py
@@ -1,7 +1,7 @@
 from urllib.request import urlopen
 from bs4 import BeautifulSoup
 html = urlopen("http://www.pythonscraping.com/pages/page2.html")
-bsObj = BeautifulSoup(html)
+bsObj = BeautifulSoup(html, "html.parser")
 tags = bsObj.findAll(lambda tag: len(tag.attrs) == 2)
 for tag in tags:
-	print(tag)
+	print(tag)
diff --git a/chapter3/1-getWikiLinks.py b/chapter3/1-getWikiLinks.py
@@ -7,10 +7,10 @@
 random.seed(datetime.datetime.now())
 def getLinks(articleUrl):
     html = urlopen("http://en.wikipedia.org"+articleUrl)
-    bsObj = BeautifulSoup(html)
+    bsObj = BeautifulSoup(html, "html.parser")
     return bsObj.find("div", {"id":"bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))
 links = getLinks("/wiki/Kevin_Bacon")
 while len(links) > 0:
     newArticle = links[random.randint(0, len(links)-1)].attrs["href"]
     print(newArticle)
-    links = getLinks(newArticle)
+    links = getLinks(newArticle)
diff --git a/chapter3/2-crawlWikipedia.py b/chapter3/2-crawlWikipedia.py
@@ -6,14 +6,14 @@
 def getLinks(pageUrl):
     global pages
     html = urlopen("http://en.wikipedia.org"+pageUrl)
-    bsObj = BeautifulSoup(html)
+    bsObj = BeautifulSoup(html, "html.parser")
     try:
         print(bsObj.h1.get_text())
         print(bsObj.find(id ="mw-content-text").findAll("p")[0])
         print(bsObj.find(id="ca-edit").find("span").find("a").attrs['href'])
     except AttributeError:
         print("This page is missing something! No worries though!")
-    
+
     for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
         if 'href' in link.attrs:
             if link.attrs['href'] not in pages:
@@ -22,4 +22,4 @@ def getLinks(pageUrl):
                 print("----------------\n"+newPage)
                 pages.add(newPage)
                 getLinks(newPage)
-getLinks("") 
+getLinks("")
diff --git a/chapter3/3-crawlSite.py b/chapter3/3-crawlSite.py
@@ -16,7 +16,7 @@ def getInternalLinks(bsObj, includeUrl):
             if link.attrs['href'] not in internalLinks:
                 internalLinks.append(link.attrs['href'])
     return internalLinks
-            
+
 #Retrieves a list of all external links found on a page
 def getExternalLinks(bsObj, excludeUrl):
     externalLinks = []
@@ -34,18 +34,18 @@ def splitAddress(address):
 
 def getRandomExternalLink(startingPage):
+    #Returns one random external link found on startingPage; when the page
+    #has none, retries from a randomly chosen internal link instead
     html = urlopen(startingPage)
-    bsObj = BeautifulSoup(html)
+    bsObj = BeautifulSoup(html, "html.parser")
     externalLinks = getExternalLinks(bsObj, splitAddress(startingPage)[0])
     if len(externalLinks) == 0:
-        internalLinks = getInternalLinks(startingPage)
-        return getNextExternalLink(internalLinks[random.randint(0, 
+        internalLinks = getInternalLinks(bsObj, startingPage)
+        #Recurse so the caller gets a single link string; getExternalLinks
+        #would hand back a whole list, which breaks the string concatenation
+        #in followExternalOnly
+        return getRandomExternalLink(internalLinks[random.randint(0,
                                   len(internalLinks)-1)])
     else:
         return externalLinks[random.randint(0, len(externalLinks)-1)]
-    
+
 def followExternalOnly(startingSite):
-    externalLink = getRandomExternalLink("http://oreilly.com")
+    externalLink = getRandomExternalLink(startingSite)
     print("Random external link is: "+externalLink)
     followExternalOnly(externalLink)
-            
-followExternalOnly("http://oreilly.com")
+
+followExternalOnly("http://oreilly.com")
diff --git a/chapter3/4-getExternalLinks.py b/chapter3/4-getExternalLinks.py
@@ -21,7 +21,7 @@ def getInternalLinks(bsObj, includeUrl):
                 else:
                     internalLinks.append(link.attrs['href'])
     return internalLinks
-            
+
 #Retrieves a list of all external links found on a page
 def getExternalLinks(bsObj, excludeUrl):
     externalLinks = []
@@ -36,7 +36,7 @@ def getExternalLinks(bsObj, excludeUrl):
 
 def getRandomExternalLink(startingPage):
     html = urlopen(startingPage)
-    bsObj = BeautifulSoup(html)
+    bsObj = BeautifulSoup(html, "html.parser")
     externalLinks = getExternalLinks(bsObj, urlparse(startingPage).netloc)
     if len(externalLinks) == 0:
         print("No external links, looking around the site for one")
@@ -45,7 +45,7 @@ def getRandomExternalLink(startingPage):
         return getRandomExternalLink(internalLinks[random.randint(0,len(internalLinks)-1)])
     else:
         return externalLinks[random.randint(0, len(externalLinks)-1)]
-    
+
 def followExternalOnly(startingSite):
     externalLink = getRandomExternalLink(startingSite)
     print("Random external link is: "+externalLink)

diff --git a/chapter3/5-getAllExternalLinks.py b/chapter3/5-getAllExternalLinks.py
@@ -21,7 +21,7 @@ def getInternalLinks(bsObj, includeUrl):
                 else:
                     internalLinks.append(link.attrs['href'])
     return internalLinks
-            
+
 #Retrieves a list of all external links found on a page
 def getExternalLinks(bsObj, excludeUrl):
     externalLinks = []
@@ -36,7 +36,7 @@ def getExternalLinks(bsObj, excludeUrl):
 
 def getRandomExternalLink(startingPage):
     html = urlopen(startingPage)
-    bsObj = BeautifulSoup(html)
+    bsObj = BeautifulSoup(html, "html.parser")
     externalLinks = getExternalLinks(bsObj, urlparse(startingPage).netloc)
     if len(externalLinks) == 0:
         print("No external links, looking around the site for one")
@@ -45,12 +45,12 @@ def getRandomExternalLink(startingPage):
         return getRandomExternalLink(internalLinks[random.randint(0,len(internalLinks)-1)])
     else:
         return externalLinks[random.randint(0, len(externalLinks)-1)]
-    
+
 def followExternalOnly(startingSite):
     externalLink = getRandomExternalLink(startingSite)
     print("Random external link is: "+externalLink)
     followExternalOnly(externalLink)
-            
+
 #Collects a list of all external URLs found on the site
 allExtLinks = set()
 allIntLinks = set()

diff --git a/chapter3/scrapy/wikiSpider/wiki.log b/chapter3/scrapy/wikiSpider/wiki.log
diff --git a/chapter3/scrapy/wikiSpider/wikiSpider/__init__.pyc b/chapter3/scrapy/wikiSpider/wikiSpider/__init__.pyc
diff --git a/chapter3/scrapy/wikiSpider/wikiSpider/items.pyc b/chapter3/scrapy/wikiSpider/wikiSpider/items.pyc
diff --git a/chapter3/scrapy/wikiSpider/wikiSpider/settings.pyc b/chapter3/scrapy/wikiSpider/wikiSpider/settings.pyc
diff --git a/chapter3/scrapy/wikiSpider/wikiSpider/spiders/__init__.pyc b/chapter3/scrapy/wikiSpider/wikiSpider/spiders/__init__.pyc
diff --git a/chapter3/scrapy/wikiSpider/wikiSpider/spiders/articleSpider.pyc b/chapter3/scrapy/wikiSpider/wikiSpider/spiders/articleSpider.pyc
diff --git a/chapter4/6-wikiHistories-Chinese.py b/chapter4/6-wikiHistories-Chinese.py
@@ -0,0 +1,61 @@
+from urllib.request import urlopen
+from urllib.request import HTTPError
+from bs4 import BeautifulSoup
+import datetime
+import json
+import random
+import re
+
+#random.seed() rejects datetime objects on Python 3.11+, so seed with the
+#current time as a float timestamp instead
+random.seed(datetime.datetime.now().timestamp())
+def getLinks(articleUrl):
+    #Fetches the English Wikipedia page at articleUrl (a path such as
+    #"/wiki/Some_Title") and returns the <a> tags inside the "bodyContent"
+    #div whose href starts with /wiki/ and contains no colon
+    page = urlopen("http://en.wikipedia.org"+articleUrl)
+    soup = BeautifulSoup(page, "html.parser")
+    body = soup.find("div", {"id":"bodyContent"})
+    articleHref = re.compile("^(/wiki/)((?!:).)*$")
+    return body.findAll("a", href=articleHref)
+
+def getHistoryIPs(pageUrl):
+    #Returns the set of link texts credited to anonymous editors on the
+    #revision history page of the article at pageUrl.
+    #Revision history pages have the form:
+    #http://en.wikipedia.org/w/index.php?title=Title_in_URL&action=history
+    title = pageUrl.replace("/wiki/", "")
+    historyUrl = "http://en.wikipedia.org/w/index.php?title="+title+"&action=history"
+    print("history url is: "+historyUrl)
+    historyPage = BeautifulSoup(urlopen(historyUrl), "html.parser")
+    #Links with class "mw-anonuserlink" show an IP address instead of a
+    #username; the set drops repeated addresses
+    anonLinks = historyPage.findAll("a", {"class":"mw-anonuserlink"})
+    return {anonLink.get_text() for anonLink in anonLinks}
+
+
+def getCountry(ipAddress):
+    #Looks up ipAddress on ip138.com and returns the location text scraped
+    #from the result page as a string, or None if the request fails or no
+    #location text is found
+    from urllib.error import URLError
+    try:
+        page = urlopen("http://www.ip138.com/ips1388.asp?action=2&ip="+ipAddress)
+        #errors='replace' keeps one undecodable byte from aborting the lookup
+        html = page.read().decode('gb2312', errors='replace')
+    except URLError:
+        #URLError is the parent class of HTTPError, so connection failures
+        #are skipped the same way as HTTP error statuses
+        return None
+    try:
+        bsObj = BeautifulSoup(html, "html.parser")
+        try:
+            response = bsObj.findAll(text=re.compile("："))[0].split("：")[2]
+        except IndexError:
+            #No usable "：" separated field: fall back to the first two
+            #strings containing "数据" and join them, so that text is
+            #returned rather than the repr of a list
+            matches = bsObj.findAll(text=re.compile("数据"))[0:2]
+            if len(matches) == 0:
+                return None
+            response = " ".join(str(match) for match in matches)
+    except AttributeError:
+        return None
+
+    return str(response)
+
+#Crawl outward from the Python article: report editor locations for every
+#linked article, then hop to one of those articles at random and repeat
+links = getLinks("/wiki/Python_(programming_language)")
+
+
+while len(links) > 0:
+    for link in links:
+        print("-------------------")
+        historyIPs = getHistoryIPs(link.attrs["href"])
+        for historyIP in historyIPs:
+            country = getCountry(historyIP)
+            if country is None:
+                #No location available for this address; skip it
+                continue
+            print(historyIP+" is from "+country)
+
+    randomIndex = random.randint(0, len(links)-1)
+    newLink = links[randomIndex].attrs["href"]
+    links = getLinks(newLink)
diff --git a/chapter4/6-wikiHistories-no-locations.py b/chapter4/6-wikiHistories-no-locations.py
@@ -0,0 +1,44 @@
+from urllib.request import urlopen
+from urllib.request import HTTPError
+from bs4 import BeautifulSoup
+import datetime
+import json
+import random
+import re
+
+#random.seed() rejects datetime objects on Python 3.11+, so seed with the
+#current time as a float timestamp instead
+random.seed(datetime.datetime.now().timestamp())
+def getLinks(articleUrl):
+    #Fetches the English Wikipedia page at articleUrl (a path such as
+    #"/wiki/Some_Title") and returns the <a> tags inside the "bodyContent"
+    #div whose href starts with /wiki/ and contains no colon
+    page = urlopen("http://en.wikipedia.org"+articleUrl)
+    soup = BeautifulSoup(page, "html.parser")
+    body = soup.find("div", {"id":"bodyContent"})
+    articleHref = re.compile("^(/wiki/)((?!:).)*$")
+    return body.findAll("a", href=articleHref)
+
+def getHistoryIPs(pageUrl):
+    #Returns the set of link texts credited to anonymous editors on the
+    #revision history page of the article at pageUrl.
+    #Revision history pages have the form:
+    #http://en.wikipedia.org/w/index.php?title=Title_in_URL&action=history
+    title = pageUrl.replace("/wiki/", "")
+    historyUrl = "http://en.wikipedia.org/w/index.php?title="+title+"&action=history"
+    print("history url is: "+historyUrl)
+    historyPage = BeautifulSoup(urlopen(historyUrl), "html.parser")
+    #Links with class "mw-anonuserlink" show an IP address instead of a
+    #username; the set drops repeated addresses
+    anonLinks = historyPage.findAll("a", {"class":"mw-anonuserlink"})
+    return {anonLink.get_text() for anonLink in anonLinks}
+
+
+
+#Crawl outward from the Python article: print the anonymous editor IPs of
+#every linked article, then hop to one of those articles at random and repeat
+links = getLinks("/wiki/Python_(programming_language)")
+
+
+while len(links) > 0:
+    for link in links:
+        print("-------------------")
+        historyIPs = getHistoryIPs(link.attrs["href"])
+        for historyIP in historyIPs:
+            print(historyIP)
+
+    randomIndex = random.randint(0, len(links)-1)
+    newLink = links[randomIndex].attrs["href"]
+    links = getLinks(newLink)
diff --git a/chapter4/google-api-key.txt b/chapter4/google-api-key.txt
@@ -0,0 +1,6 @@
+AIzaSyD9Dns12MuQ0ZtLFh-fvjdlpRSavXw6lRM
+
+Usage:
+curl -d  @google-maps-geoapi-example.json  -H "Content-Type: application/json" -i "https://www.googleapis.com/geolocation/v1/geolocate?key=AIzaSyD9Dns12MuQ0ZtLFh-fvjdlpRSavXw6lRM"
+
+Warning: in the command above, replace the value after key= with your own API key, and keep real keys out of version control.
diff --git a/chapter4/google-maps-geoapi-example.json b/chapter4/google-maps-geoapi-example.json
@@ -0,0 +1,31 @@
+{
+ "homeMobileCountryCode": 310,
+ "homeMobileNetworkCode": 260,
+ "radioType": "gsm",
+ "carrier": "T-Mobile",
+ "cellTowers": [
+  {
+   "cellId": 39627456,
+   "locationAreaCode": 40495,
+   "mobileCountryCode": 310,
+   "mobileNetworkCode": 260,
+   "age": 0,
+   "signalStrength": -95
+  }
+ ],
+ "wifiAccessPoints": [
+  {
+   "macAddress": "01:23:45:67:89:AB",
+   "signalStrength": 8,
+   "age": 0,
+   "signalToNoiseRatio": -65,
+   "channel": 8
+  },
+  {
+   "macAddress": "01:23:45:67:89:AC",
+   "signalStrength": 4,
+   "age": 0
+  }
+ ]
+}
+