diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6241f04
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+cli
+chapter5/downloaded/
diff --git a/chapter1/2-beautifulSoup.py b/chapter1/2-beautifulSoup.py
index 1911093..9b159fc 100644
--- a/chapter1/2-beautifulSoup.py
+++ b/chapter1/2-beautifulSoup.py
@@ -2,5 +2,5 @@
 from bs4 import BeautifulSoup
 html = urlopen("http://www.pythonscraping.com/exercises/exercise1.html")
-bsObj = BeautifulSoup(html.read())
+bsObj = BeautifulSoup(html.read(), "html.parser")
 print(bsObj.h1)
diff --git a/chapter1/3-exceptionHandling.py b/chapter1/3-exceptionHandling.py
index 331a7ee..65b482f 100644
--- a/chapter1/3-exceptionHandling.py
+++ b/chapter1/3-exceptionHandling.py
@@ -11,7 +11,7 @@ def getTitle(url):
         print(e)
         return None
     try:
-        bsObj = BeautifulSoup(html.read())
+        bsObj = BeautifulSoup(html.read(), "html.parser")
         title = bsObj.body.h1
     except AttributeError as e:
         return None
@@ -22,5 +22,5 @@ def getTitle(url):
     print("Title could not be found")
 else:
     print(title)
-
-
\ No newline at end of file
+
+
diff --git a/chapter2/1-selectByClass.py b/chapter2/1-selectByClass.py
index 7f4c489..2a90755 100644
--- a/chapter2/1-selectByClass.py
+++ b/chapter2/1-selectByClass.py
@@ -2,7 +2,7 @@
 from bs4 import BeautifulSoup
 html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
-bsObj = BeautifulSoup(html)
+bsObj = BeautifulSoup(html, "html.parser")
 nameList = bsObj.findAll("span", {"class":"green"})
 for name in nameList:
-    print(name.get_text())
\ No newline at end of file
+    print(name.get_text())
diff --git a/chapter2/2-selectByAttribute.py b/chapter2/2-selectByAttribute.py
index 01d9c90..e63426d 100644
--- a/chapter2/2-selectByAttribute.py
+++ b/chapter2/2-selectByAttribute.py
@@ -2,6 +2,6 @@
 from bs4 import BeautifulSoup
 html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
-bsObj = BeautifulSoup(html)
+bsObj = BeautifulSoup(html, "html.parser")
 allText = bsObj.findAll(id="text")
-print(allText[0].get_text())
\ No newline at end of file
+print(allText[0].get_text())
diff --git a/chapter2/3-findDescendants.py b/chapter2/3-findDescendants.py
index 7b127de..2f4616b 100644
--- a/chapter2/3-findDescendants.py
+++ b/chapter2/3-findDescendants.py
@@ -2,7 +2,7 @@
 from bs4 import BeautifulSoup
 html = urlopen("http://www.pythonscraping.com/pages/page3.html")
-bsObj = BeautifulSoup(html)
+bsObj = BeautifulSoup(html, "html.parser")
 for child in bsObj.find("table",{"id":"giftList"}).children:
-    print(child)
\ No newline at end of file
+    print(child)
diff --git a/chapter2/4-findSiblings.py b/chapter2/4-findSiblings.py
index c850ef1..427b4ee 100644
--- a/chapter2/4-findSiblings.py
+++ b/chapter2/4-findSiblings.py
@@ -1,7 +1,7 @@
 from urllib.request import urlopen
 from bs4 import BeautifulSoup
 html = urlopen("http://www.pythonscraping.com/pages/page3.html")
-bsObj = BeautifulSoup(html)
+bsObj = BeautifulSoup(html, "html.parser")
 for sibling in bsObj.find("table",{"id":"giftList"}).tr.next_siblings:
-    print(sibling)
\ No newline at end of file
+    print(sibling)
diff --git a/chapter2/5-findParents.py b/chapter2/5-findParents.py
index d0e4593..50ec5ee 100644
--- a/chapter2/5-findParents.py
+++ b/chapter2/5-findParents.py
@@ -2,5 +2,5 @@
 from bs4 import BeautifulSoup
 html = urlopen("http://www.pythonscraping.com/pages/page3.html")
-bsObj = BeautifulSoup(html)
-print(bsObj.find("img",{"src":"../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())
\ No newline at end of file
+bsObj = BeautifulSoup(html, "html.parser")
+print(bsObj.find("img",{"src":"../img/gifts/img1.jpg"}).parent.previous_sibling.get_text())
diff --git a/chapter2/6-regularExpressions.py b/chapter2/6-regularExpressions.py
index ef12761..285ed98 100644
--- a/chapter2/6-regularExpressions.py
+++ b/chapter2/6-regularExpressions.py
@@ -3,7 +3,7 @@
 import re
 html = urlopen("http://www.pythonscraping.com/pages/page3.html")
-bsObj = BeautifulSoup(html)
+bsObj = BeautifulSoup(html, "html.parser")
 images = bsObj.findAll("img", {"src":re.compile("\.\.\/img\/gifts/img.*\.jpg")})
-for image in images: 
+for image in images:
     print(image["src"])
diff --git a/chapter2/7-lambdaExpressions.py b/chapter2/7-lambdaExpressions.py
index 1704fa9..97a12e6 100644
--- a/chapter2/7-lambdaExpressions.py
+++ b/chapter2/7-lambdaExpressions.py
@@ -1,7 +1,7 @@
 from urllib.request import urlopen
 from bs4 import BeautifulSoup
 html = urlopen("http://www.pythonscraping.com/pages/page2.html")
-bsObj = BeautifulSoup(html)
+bsObj = BeautifulSoup(html, "html.parser")
 tags = bsObj.findAll(lambda tag: len(tag.attrs) == 2)
 for tag in tags:
-    print(tag)
\ No newline at end of file
+    print(tag)
diff --git a/chapter3/1-getWikiLinks.py b/chapter3/1-getWikiLinks.py
index 96ca211..313832a 100644
--- a/chapter3/1-getWikiLinks.py
+++ b/chapter3/1-getWikiLinks.py
@@ -7,10 +7,10 @@
 random.seed(datetime.datetime.now())
 def getLinks(articleUrl):
     html = urlopen("http://en.wikipedia.org"+articleUrl)
-    bsObj = BeautifulSoup(html)
+    bsObj = BeautifulSoup(html, "html.parser")
     return bsObj.find("div", {"id":"bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))
 links = getLinks("/wiki/Kevin_Bacon")
 while len(links) > 0:
     newArticle = links[random.randint(0, len(links)-1)].attrs["href"]
     print(newArticle)
-    links = getLinks(newArticle)
\ No newline at end of file
+    links = getLinks(newArticle)
diff --git a/chapter3/2-crawlWikipedia.py b/chapter3/2-crawlWikipedia.py
index ec62d06..ecdcf87 100644
--- a/chapter3/2-crawlWikipedia.py
+++ b/chapter3/2-crawlWikipedia.py
@@ -6,14 +6,14 @@
 def getLinks(pageUrl):
     global pages
     html = urlopen("http://en.wikipedia.org"+pageUrl)
-    bsObj = BeautifulSoup(html)
+    bsObj = BeautifulSoup(html, "html.parser")
    try:
         print(bsObj.h1.get_text())
         print(bsObj.find(id ="mw-content-text").findAll("p")[0])
         print(bsObj.find(id="ca-edit").find("span").find("a").attrs['href'])
     except AttributeError:
         print("This page is missing something! No worries though!")
-    
+
     for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
         if 'href' in link.attrs:
             if link.attrs['href'] not in pages:
@@ -22,4 +22,4 @@ def getLinks(pageUrl):
                 print("----------------\n"+newPage)
                 pages.add(newPage)
                 getLinks(newPage)
-getLinks("")
\ No newline at end of file
+getLinks("")
diff --git a/chapter3/3-crawlSite.py b/chapter3/3-crawlSite.py
index f34cc47..3615342 100644
--- a/chapter3/3-crawlSite.py
+++ b/chapter3/3-crawlSite.py
@@ -16,7 +16,7 @@ def getInternalLinks(bsObj, includeUrl):
             if link.attrs['href'] not in internalLinks:
                 internalLinks.append(link.attrs['href'])
     return internalLinks
-    
+
 #Retrieves a list of all external links found on a page
 def getExternalLinks(bsObj, excludeUrl):
     externalLinks = []
@@ -34,18 +34,18 @@ def splitAddress(address):
 def getRandomExternalLink(startingPage):
     html = urlopen(startingPage)
-    bsObj = BeautifulSoup(html)
+    bsObj = BeautifulSoup(html, "html.parser")
     externalLinks = getExternalLinks(bsObj, splitAddress(startingPage)[0])
     if len(externalLinks) == 0:
-        internalLinks = getInternalLinks(startingPage)
-        return getNextExternalLink(internalLinks[random.randint(0,
+        internalLinks = getInternalLinks(bsObj, startingPage)
+        return getExternalLinks(bsObj, internalLinks[random.randint(0,
                                    len(internalLinks)-1)])
     else:
         return externalLinks[random.randint(0, len(externalLinks)-1)]
-    
+
 def followExternalOnly(startingSite):
-    externalLink = getRandomExternalLink("http://oreilly.com")
+    externalLink = getRandomExternalLink(startingSite)
     print("Random external link is: "+externalLink)
     followExternalOnly(externalLink)
-    
-followExternalOnly("http://oreilly.com")
\ No newline at end of file
+
+followExternalOnly("http://oreilly.com")
diff --git a/chapter3/4-getExternalLinks.py b/chapter3/4-getExternalLinks.py
index 54fb854..b4fb296 100644
--- a/chapter3/4-getExternalLinks.py
+++ b/chapter3/4-getExternalLinks.py
@@ -21,7 +21,7 @@ def getInternalLinks(bsObj, includeUrl):
                 else:
                     internalLinks.append(link.attrs['href'])
     return internalLinks
-    
+
 #Retrieves a list of all external links found on a page
 def getExternalLinks(bsObj, excludeUrl):
     externalLinks = []
@@ -36,7 +36,7 @@ def getExternalLinks(bsObj, excludeUrl):
 def getRandomExternalLink(startingPage):
     html = urlopen(startingPage)
-    bsObj = BeautifulSoup(html)
+    bsObj = BeautifulSoup(html, "html.parser")
     externalLinks = getExternalLinks(bsObj, urlparse(startingPage).netloc)
     if len(externalLinks) == 0:
         print("No external links, looking around the site for one")
@@ -45,7 +45,7 @@ def getRandomExternalLink(startingPage):
         return getRandomExternalLink(internalLinks[random.randint(0,len(internalLinks)-1)])
     else:
         return externalLinks[random.randint(0, len(externalLinks)-1)]
-    
+
 def followExternalOnly(startingSite):
     externalLink = getRandomExternalLink(startingSite)
     print("Random external link is: "+externalLink)
diff --git a/chapter3/5-getAllExternalLinks.py b/chapter3/5-getAllExternalLinks.py
index c08b555..80e40f9 100644
--- a/chapter3/5-getAllExternalLinks.py
+++ b/chapter3/5-getAllExternalLinks.py
@@ -21,7 +21,7 @@ def getInternalLinks(bsObj, includeUrl):
                 else:
                     internalLinks.append(link.attrs['href'])
     return internalLinks
-    
+
 #Retrieves a list of all external links found on a page
 def getExternalLinks(bsObj, excludeUrl):
     externalLinks = []
@@ -36,7 +36,7 @@ def getExternalLinks(bsObj, excludeUrl):
 def getRandomExternalLink(startingPage):
     html = urlopen(startingPage)
-    bsObj = BeautifulSoup(html)
+    bsObj = BeautifulSoup(html, "html.parser")
     externalLinks = getExternalLinks(bsObj, urlparse(startingPage).netloc)
     if len(externalLinks) == 0:
         print("No external links, looking around the site for one")
@@ -45,12 +45,12 @@ def getRandomExternalLink(startingPage):
         return getRandomExternalLink(internalLinks[random.randint(0,len(internalLinks)-1)])
     else:
         return externalLinks[random.randint(0, len(externalLinks)-1)]
-    
+
 def followExternalOnly(startingSite):
     externalLink = getRandomExternalLink(startingSite)
     print("Random external link is: "+externalLink)
     followExternalOnly(externalLink)
-    
+
 #Collects a list of all external URLs found on the site
 allExtLinks = set()
 allIntLinks = set()
diff --git a/chapter3/scrapy/wikiSpider/wiki.log b/chapter3/scrapy/wikiSpider/wiki.log
deleted file mode 100644
index 999a381..0000000
--- a/chapter3/scrapy/wikiSpider/wiki.log
+++ /dev/null
@@ -1,18 +0,0 @@
-2015-03-09 00:11:36-0400 [scrapy] INFO: Scrapy 0.24.4 started (bot: wikiSpider)
-2015-03-09 00:11:36-0400 [scrapy] INFO: Optional features available: ssl, http11
-2015-03-09 00:11:36-0400 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'wikiSpider.spiders', 'SPIDER_MODULES': ['wikiSpider.spiders'], 'LOG_FILE': 'wiki.log', 'BOT_NAME': 'wikiSpider'}
-2015-03-09 00:11:36-0400 [scrapy] INFO: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState
-2015-03-09 00:11:37-0400 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
-2015-03-09 00:11:37-0400 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
-2015-03-09 00:11:37-0400 [scrapy] INFO: Enabled item pipelines:
-2015-03-09 00:11:37-0400 [article] INFO: Spider opened
-2015-03-09 00:11:37-0400 [article] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
-2015-03-09 00:11:37-0400 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
-2015-03-09 00:11:37-0400 [scrapy] DEBUG: Web service listening on 127.0.0.1:6080
-2015-03-09 00:11:37-0400 [article] DEBUG: Crawled (200) (referer: None)
-2015-03-09 00:11:37-0400 [scrapy] INFO: Received SIGINT, shutting down gracefully. Send again to force
-2015-03-09 00:11:37-0400 [article] INFO: Closing spider (shutdown)
-2015-03-09 00:11:37-0400 [article] DEBUG: Filtered offsite request to 'en.wikibooks.org':
-2015-03-09 00:11:37-0400 [article] DEBUG: Filtered offsite request to 'code.google.com':
-2015-03-09 00:11:37-0400 [article] DEBUG: Filtered offsite request to 'en.wikiquote.org':
-2015-03-09 00:11:37-0400 [scrapy] INFO: Received SIGINT twice, forcing unclean shutdown
diff --git a/chapter3/scrapy/wikiSpider/wikiSpider/__init__.pyc b/chapter3/scrapy/wikiSpider/wikiSpider/__init__.pyc
index 2aea624..1c09f02 100644
Binary files a/chapter3/scrapy/wikiSpider/wikiSpider/__init__.pyc and b/chapter3/scrapy/wikiSpider/wikiSpider/__init__.pyc differ
diff --git a/chapter3/scrapy/wikiSpider/wikiSpider/items.pyc b/chapter3/scrapy/wikiSpider/wikiSpider/items.pyc
index d80c209..eef6e2b 100644
Binary files a/chapter3/scrapy/wikiSpider/wikiSpider/items.pyc and b/chapter3/scrapy/wikiSpider/wikiSpider/items.pyc differ
diff --git a/chapter3/scrapy/wikiSpider/wikiSpider/settings.pyc b/chapter3/scrapy/wikiSpider/wikiSpider/settings.pyc
index c69c338..12ab1a3 100644
Binary files a/chapter3/scrapy/wikiSpider/wikiSpider/settings.pyc and b/chapter3/scrapy/wikiSpider/wikiSpider/settings.pyc differ
diff --git a/chapter3/scrapy/wikiSpider/wikiSpider/spiders/__init__.pyc b/chapter3/scrapy/wikiSpider/wikiSpider/spiders/__init__.pyc
index 78b171c..164d22b 100644
Binary files a/chapter3/scrapy/wikiSpider/wikiSpider/spiders/__init__.pyc and b/chapter3/scrapy/wikiSpider/wikiSpider/spiders/__init__.pyc differ
diff --git a/chapter3/scrapy/wikiSpider/wikiSpider/spiders/articleSpider.pyc b/chapter3/scrapy/wikiSpider/wikiSpider/spiders/articleSpider.pyc
index 39fe60f..fe50d8c 100644
Binary files a/chapter3/scrapy/wikiSpider/wikiSpider/spiders/articleSpider.pyc and b/chapter3/scrapy/wikiSpider/wikiSpider/spiders/articleSpider.pyc differ
diff --git a/chapter4/6-wikiHistories-Chinese.py b/chapter4/6-wikiHistories-Chinese.py
new file mode 100644
index 0000000..89169c6
--- /dev/null
+++ b/chapter4/6-wikiHistories-Chinese.py
@@ -0,0 +1,61 @@
+from urllib.request import urlopen
+from urllib.request import HTTPError
+from bs4 import BeautifulSoup
+import datetime
+import json
+import random
+import re
+
+random.seed(datetime.datetime.now())
+def getLinks(articleUrl):
+    html = urlopen("http://en.wikipedia.org"+articleUrl)
+    bsObj = BeautifulSoup(html, "html.parser")
+    return bsObj.find("div", {"id":"bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))
+
+def getHistoryIPs(pageUrl):
+    #Format of revision history pages is:
+    #http://en.wikipedia.org/w/index.php?title=Title_in_URL&action=history
+    pageUrl = pageUrl.replace("/wiki/", "")
+    historyUrl = "http://en.wikipedia.org/w/index.php?title="+pageUrl+"&action=history"
+    print("history url is: "+historyUrl)
+    html = urlopen(historyUrl)
+    bsObj = BeautifulSoup(html, "html.parser")
+    #finds only the links with class "mw-anonuserlink", which have IP addresses
+    #instead of usernames
+    ipAddresses = bsObj.findAll("a", {"class":"mw-anonuserlink"})
+    addressList = set()
+    for ipAddress in ipAddresses:
+        addressList.add(ipAddress.get_text())
+    return addressList
+
+
+def getCountry(ipAddress):
+    try:
+        html = urlopen("http://www.ip138.com/ips1388.asp?action=2&ip="+ipAddress).read().decode('gb2312')
+    except HTTPError:
+        return None
+    try:
+        bsObj = BeautifulSoup(html, "html.parser")
+        try:
+            response = bsObj.findAll(text=re.compile(":"))[0].split(":")[2]
+        except IndexError:
+            response = bsObj.findAll(text=re.compile("数据"))[0:2]
+    except AttributeError:
+        return None
+
+    return str(response)
+
+links = getLinks("/wiki/Python_(programming_language)")
+
+
+while(len(links) > 0):
+    for link in links:
+        print("-------------------")
+        historyIPs = getHistoryIPs(link.attrs["href"])
+        for historyIP in historyIPs:
+            country = getCountry(historyIP)
+            if country is not None:
+                print(historyIP+" is from "+country)
+
+    newLink = links[random.randint(0, len(links)-1)].attrs["href"]
+    links = getLinks(newLink)
diff --git a/chapter4/6-wikiHistories-no-locations.py b/chapter4/6-wikiHistories-no-locations.py
new file mode 100644
index 0000000..c727f04
--- /dev/null
+++ b/chapter4/6-wikiHistories-no-locations.py
@@ -0,0 +1,44 @@
+from urllib.request import urlopen
+from urllib.request import HTTPError
+from bs4 import BeautifulSoup
+import datetime
+import json
+import random
+import re
+
+random.seed(datetime.datetime.now())
+def getLinks(articleUrl):
+    html = urlopen("http://en.wikipedia.org"+articleUrl)
+    bsObj = BeautifulSoup(html, "html.parser")
+    return bsObj.find("div", {"id":"bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))
+
+def getHistoryIPs(pageUrl):
+    #Format of revision history pages is:
+    #http://en.wikipedia.org/w/index.php?title=Title_in_URL&action=history
+    pageUrl = pageUrl.replace("/wiki/", "")
+    historyUrl = "http://en.wikipedia.org/w/index.php?title="+pageUrl+"&action=history"
+    print("history url is: "+historyUrl)
+    html = urlopen(historyUrl)
+    bsObj = BeautifulSoup(html, "html.parser")
+    #finds only the links with class "mw-anonuserlink", which have IP addresses
+    #instead of usernames
+    ipAddresses = bsObj.findAll("a", {"class":"mw-anonuserlink"})
+    addressList = set()
+    for ipAddress in ipAddresses:
+        addressList.add(ipAddress.get_text())
+    return addressList
+
+
+
+links = getLinks("/wiki/Python_(programming_language)")
+
+
+while(len(links) > 0):
+    for link in links:
+        print("-------------------")
+        historyIPs = getHistoryIPs(link.attrs["href"])
+        for historyIP in historyIPs:
+            print(historyIP)
+
+    newLink = links[random.randint(0, len(links)-1)].attrs["href"]
+    links = getLinks(newLink)
diff --git a/chapter4/google-api-key.txt b/chapter4/google-api-key.txt
new file mode 100644
index 0000000..399d605
--- /dev/null
+++ b/chapter4/google-api-key.txt
@@ -0,0 +1,6 @@
+AIzaSyD9Dns12MuQ0ZtLFh-fvjdlpRSavXw6lRM
+
+Usage:
+curl -d @google-maps-geoapi-example.json -H "Content-Type: application/json" -i "https://www.googleapis.com/geolocation/v1/geolocate?key=AIzaSyD9Dns12MuQ0ZtLFh-fvjdlpRSavXw6lRM"
+
+Warning: Replace key=[Your Key Above]
diff --git a/chapter4/google-maps-geoapi-example.json b/chapter4/google-maps-geoapi-example.json
new file mode 100644
index 0000000..da3ab23
--- /dev/null
+++ b/chapter4/google-maps-geoapi-example.json
@@ -0,0 +1,31 @@
+{
+  "homeMobileCountryCode": 310,
+  "homeMobileNetworkCode": 260,
+  "radioType": "gsm",
+  "carrier": "T-Mobile",
+  "cellTowers": [
+    {
+      "cellId": 39627456,
+      "locationAreaCode": 40495,
+      "mobileCountryCode": 310,
+      "mobileNetworkCode": 260,
+      "age": 0,
+      "signalStrength": -95
+    }
+  ],
+  "wifiAccessPoints": [
+    {
+      "macAddress": "01:23:45:67:89:AB",
+      "signalStrength": 8,
+      "age": 0,
+      "signalToNoiseRatio": -65,
+      "channel": 8
+    },
+    {
+      "macAddress": "01:23:45:67:89:AC",
+      "signalStrength": 4,
+      "age": 0
+    }
+  ]
+}
+
diff --git a/chapter5/1-getPageMedia.py b/chapter5/1-getPageMedia.py
index 02869a8..db03cbf 100644
--- a/chapter5/1-getPageMedia.py
+++ b/chapter5/1-getPageMedia.py
@@ -12,7 +12,7 @@ def getAbsoluteURL(baseUrl, source):
     elif source.startswith("http://"):
         url = source
     elif source.startswith("www."):
-        url = source[4:]
+        source = source[4:]
         url = "http://"+source
     else:
         url = baseUrl+"/"+source
@@ -32,11 +32,11 @@ def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
     return path
 html = urlopen("http://www.pythonscraping.com")
-bsObj = BeautifulSoup(html)
+bsObj = BeautifulSoup(html, "html.parser")
 downloadList = bsObj.findAll(src=True)
 for download in downloadList:
     fileUrl = getAbsoluteURL(baseUrl, download["src"])
     if fileUrl is not None:
         print(fileUrl)
-    urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))
\ No newline at end of file
+    urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))
diff --git a/chapter5/3-scrapeCsv.py b/chapter5/3-scrapeCsv.py
index 2c57942..607dbf4 100644
--- a/chapter5/3-scrapeCsv.py
+++ b/chapter5/3-scrapeCsv.py
@@ -3,12 +3,12 @@
 from bs4 import BeautifulSoup
 html = urlopen("http://en.wikipedia.org/wiki/Comparison_of_text_editors")
-bsObj = BeautifulSoup(html)
+bsObj = BeautifulSoup(html, "html.parser")
 #The main comparison table is currently the first table on the page
 table = bsObj.findAll("table",{"class":"wikitable"})[0]
 rows = table.findAll("tr")
-csvFile = open("files/editors.csv", 'wt', newline='', encoding='utf-8')
+csvFile = open("../files/editors.csv", 'wt', newline='', encoding='utf-8')
 writer = csv.writer(csvFile)
 try:
     for row in rows:
diff --git a/chapter5/4-mysqlBasicExample.py b/chapter5/4-mysqlBasicExample.py
index 9c3a28d..20aa769 100644
--- a/chapter5/4-mysqlBasicExample.py
+++ b/chapter5/4-mysqlBasicExample.py
@@ -1,9 +1,10 @@
 import pymysql
-conn = pymysql.connect(host='127.0.0.1', unix_socket='/tmp/mysql.sock',
-                       user='root', passwd=None, db='mysql')
+conn = pymysql.connect(host='127.0.0.1', unix_socket='/run/mysqld/mysqld.sock',user='root', passwd=None, db='scraping')
+###conn = pymysql.connect(host='127.0.0.1', unix_socket='/tmp/mysql.sock', ### invalid socket location; you can omit this parameter or refer to the docs
+### user='root', passwd=None, db='mysql') ### invalid db name; check your existing databases before selecting one
 cur = conn.cursor()
-cur.execute("USE scraping")
+###cur.execute("USE scraping") ### no need to specify db name twice
 cur.execute("SELECT * FROM pages WHERE id=1")
 print(cur.fetchone())
 cur.close()
-conn.close()
\ No newline at end of file
+conn.close()
diff --git a/chapter5/5-storeWikiLinks.py b/chapter5/5-storeWikiLinks.py
index 55440be..7b8157d 100644
--- a/chapter5/5-storeWikiLinks.py
+++ b/chapter5/5-storeWikiLinks.py
@@ -5,9 +5,9 @@
 import random
 import pymysql
-conn = pymysql.connect(host='127.0.0.1', unix_socket='/tmp/mysql.sock', user='root', passwd=None, db='mysql', charset='utf8')
+conn = pymysql.connect(host='127.0.0.1', unix_socket='/run/mysqld/mysqld.sock', user='root', passwd=None, db='scraping', charset='utf8')
 cur = conn.cursor()
-cur.execute("USE scraping")
+### cur.execute("USE scraping")
 random.seed(datetime.datetime.now())
@@ -17,7 +17,7 @@ def store(title, content):
 def getLinks(articleUrl):
     html = urlopen("http://en.wikipedia.org"+articleUrl)
-    bsObj = BeautifulSoup(html)
+    bsObj = BeautifulSoup(html, "html.parser")
     title = bsObj.find("h1").get_text()
     content = bsObj.find("div", {"id":"mw-content-text"}).find("p").get_text()
     store(title, content)
diff --git a/chapter5/6-6DegreesCrawlWiki.py b/chapter5/6-6DegreesCrawlWiki.py
index 8af29db..f1656d1 100644
--- a/chapter5/6-6DegreesCrawlWiki.py
+++ b/chapter5/6-6DegreesCrawlWiki.py
@@ -3,16 +3,16 @@
 import pymysql
 from urllib.request import urlopen
-conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='root', db='mysql', charset='utf8')
+conn = pymysql.connect(host='127.0.0.1', user='root', passwd=None, db='wikipedia', charset='utf8')
 cur = conn.cursor()
-cur.execute("USE wikipedia")
+### cur.execute("USE wikipedia")
 def pageScraped(url):
     cur.execute("SELECT * FROM pages WHERE url = %s", (url))
     if cur.rowcount == 0:
         return False
     page = cur.fetchone()
-    
+
     cur.execute("SELECT * FROM links WHERE fromPageId = %s", (int(page[0])))
     if cur.rowcount == 0:
         return False
@@ -39,7 +39,7 @@ def getLinks(pageUrl, recursionLevel):
         return
     pageId = insertPageIfNotExists(pageUrl)
     html = urlopen("http://en.wikipedia.org"+pageUrl)
-    bsObj = BeautifulSoup(html)
+    bsObj = BeautifulSoup(html, "html.parser")
     for link in bsObj.findAll("a", href=re.compile("^(/wiki/)((?!:).)*$")):
         insertLink(pageId, insertPageIfNotExists(link.attrs['href']))
         if not pageScraped(link.attrs['href']):
@@ -47,8 +47,8 @@ def getLinks(pageUrl, recursionLevel):
             newPage = link.attrs['href']
             print(newPage)
             getLinks(newPage, recursionLevel+1)
-        else: 
+        else:
             print("Skipping: "+str(link.attrs['href'])+" found on "+pageUrl)
-getLinks("/wiki/Kevin_Bacon", 0) 
+getLinks("/wiki/Kevin_Bacon", 0)
 cur.close()
 conn.close()
diff --git a/chapter5/8-sendEmailWhenChristmas.py b/chapter5/8-sendEmailWhenChristmas.py
index d738ec3..037662c 100644
--- a/chapter5/8-sendEmailWhenChristmas.py
+++ b/chapter5/8-sendEmailWhenChristmas.py
@@ -14,7 +14,7 @@ def sendMail(subject, body):
     s.send_message(msg)
     s.quit()
-bsObj = BeautifulSoup(urlopen("https://isitchristmas.com/"))
+bsObj = BeautifulSoup(urlopen("https://isitchristmas.com/"), "html.parser")
 while(bsObj.find("a", {"id":"answer"}).attrs['title'] == "NO"):
     print("It is not Christmas yet.")
     time.sleep(3600)
diff --git a/chapter6/2-getUtf8Text.py b/chapter6/2-getUtf8Text.py
index 5764c3c..c47c7f6 100644
--- a/chapter6/2-getUtf8Text.py
+++ b/chapter6/2-getUtf8Text.py
@@ -2,8 +2,8 @@
 from bs4 import BeautifulSoup
 html = urlopen("http://en.wikipedia.org/wiki/Python_(programming_language)")
-bsObj = BeautifulSoup(html)
+bsObj = BeautifulSoup(html, "html.parser")
 content = bsObj.find("div", {"id":"mw-content-text"}).get_text()
 content = bytes(content, "UTF-8")
 content = content.decode("UTF-8")
-print(content)
\ No newline at end of file
+print(content)
diff --git a/chapter6/6-readDocx.py b/chapter6/6-readDocx.py
index 203a9bd..2368852 100644
--- a/chapter6/6-readDocx.py
+++ b/chapter6/6-readDocx.py
@@ -8,7 +8,7 @@
 document = ZipFile(wordFile)
 xml_content = document.read('word/document.xml')
-wordObj = BeautifulSoup(xml_content.decode('utf-8'))
+wordObj = BeautifulSoup(xml_content.decode('utf-8'), "html.parser")
 textStrings = wordObj.findAll("w:t")
 for textElem in textStrings:
-    print(textElem.text)
\ No newline at end of file
+    print(textElem.text)
diff --git a/chapter6/from urllib.request import urlopen b/chapter6/from urllib.request import urlopen
deleted file mode 100644
index 52fe6b8..0000000
--- a/chapter6/from urllib.request import urlopen
+++ /dev/null
@@ -1,10 +0,0 @@
-from urllib.request import urlopen
-from io import StringIO
-import csv
-
-data = urlopen("http://pythonscraping.com/files/MontyPythonAlbums.csv").read().decode('ascii', 'ignore')
-dataFile = StringIO(data)
-csvReader = csv.reader(dataFile)
-
-for row in csvReader:
-print(row)
\ No newline at end of file
diff --git a/chapter6/readPdf.py b/chapter6/readPdf.py
deleted file mode 100644
index c4ecee8..0000000
--- a/chapter6/readPdf.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from pdfminer.pdfinterp import PDFResourceManager, process_pdf
-from pdfminer.converter import TextConverter
-from pdfminer.layout import LAParams
-from io import StringIO
-from io import open
-from urllib.request import urlopen
-
-def readPDF(pdfFile):
-    rsrcmgr = PDFResourceManager()
-    retstr = StringIO()
-    laparams = LAParams()
-    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
-
-    process_pdf(rsrcmgr, device, pdfFile)
-    device.close()
-
-    content = retstr.getvalue()
-    retstr.close()
-    return content
-
-pdfFile = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf")
-outputString = readPDF(pdfFile)
-print(outputString)
-pdfFile.close()
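
Note (editorial, not part of the patch): most hunks above make the same change, passing an explicit parser name as the second argument to the BeautifulSoup constructor. A minimal sketch of the pattern, assuming bs4 is installed and using one of the book's example pages; the commented-out "lxml" line is an optional alternative that only works if the third-party lxml package is installed:

from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/pages/page3.html")
# Naming the parser explicitly avoids bs4's "No parser was explicitly specified"
# warning and keeps parsing behavior consistent across machines.
bsObj = BeautifulSoup(html.read(), "html.parser")
# bsObj = BeautifulSoup(html.read(), "lxml")  # alternative parser, requires lxml
print(bsObj.h1)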