# CIS 192 Spring 2015
# Lecture 10: HTTP Requests and HTML Parsing
# Author: Robert Rand
# Date: November 5, 2015

import requests

''' Requests '''

google = 'http://www.google.com'
r = requests.get(google)
type(r)
r.text            # response body, decoded to text
type(r.text)
r.content         # raw response body, as bytes
type(r.content)
r.encoding
r.headers
r.status_code

# Status codes
cis192 = 'http://cis.upenn.edu/~cis192'
r = requests.get(cis192)
r.status_code

r = requests.get('{}/nope'.format(cis192))
r.status_code
r.raise_for_status()  # raises requests.exceptions.HTTPError for the 404 above
r.status_code == requests.codes.ok
r.status_code == requests.codes.not_found

r = requests.get('{}/files/hw/hw7.py'.format(cis192))
r.status_code
r.raise_for_status()  # no-op on a successful response
r.status_code == requests.codes.ok
r.status_code == requests.codes.not_found

# Parameters
docs_search = 'https://docs.python.org/2/search.html'
query = '?q=itertools&check_keywords=yes&area=default'
r1 = requests.get('{}{}'.format(docs_search, query))
query_params = {'q': 'itertools', 'check_keywords': 'yes', 'area': 'default'}
r2 = requests.get(docs_search, params=query_params)  # requests builds the query string for us
r2.url
r1.status_code
r2.status_code
r1.text == r2.text

# POSTing
post_end = 'http://httpbin.org/post'
r = requests.post(post_end, data={"elm": 1, "pine": 2, "oak": 3})
r.status_code
print(r.text)

r = requests.put("http://httpbin.org/put", data={"maple": "10"})
r = requests.delete("http://httpbin.org/delete")
r = requests.head("http://httpbin.org/get")
r = requests.options("http://httpbin.org/get")
print(r.text)

# More at: http://docs.python-requests.org/en/latest/user/quickstart/
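# A minimal sketch (my own, not from the lecture) tying the calls above
# together: GET a URL with a timeout and return the body text, or None on
# any request failure. The name `fetch_text` and the 5-second timeout are
# assumptions, not part of the original notes.
def fetch_text(url, timeout=5.0):
    try:
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()  # turn 4xx/5xx responses into an exception
    except requests.exceptions.RequestException:
        return None
    return resp.text

# Example usage:
# body = fetch_text('http://httpbin.org/get')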

# HTML
micropage = '''
<html>
<body>
<p>This is the first paragraph</p>
<div>
<p>Sub paragraph</p>
</div>
<p>This is the second paragraph</p>
</body>
</html>
'''
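# micropage isn't used again below. As a sketch of what the standard
# library offers before reaching for Beautiful Soup, here is a minimal
# HTMLParser subclass that collects the text between tags. The class name
# TextCollector is my own, and this assumes Python 3's html.parser module.
from html.parser import HTMLParser

class TextCollector(HTMLParser):
    def __init__(self):
        super().__init__()
        self.chunks = []

    def handle_data(self, data):
        # handle_data fires for each run of text between tags
        text = data.strip()
        if text:
            self.chunks.append(text)

collector = TextCollector()
collector.feed(micropage)
collector.chunks  # ['This is the first paragraph', 'Sub paragraph', 'This is the second paragraph']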
''' Beautiful Soup '''

# bs4 is a third-party package: pip install beautifulsoup4
from bs4 import BeautifulSoup

doormice = """
<html><head><title>The Dormouse's story</title></head>

<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>

""" soup = BeautifulSoup(doormice, "html.parser") print(soup.prettify()) soup.title soup.title.name soup.title.string soup.title.parent.name soup.p soup.p['class'] soup.p.attrs soup.a.string soup.find_all('a') [x.string for x in soup.find_all('a')] [x['href'] for x in soup.find_all('a')] soup.find(id="link3") soup.find(id="link3")['href'] soup.text # Navigation soup.head soup.html.contents soup.html.children list(soup.html.descendants) list(soup.strings) list(soup.stripped_strings) soup.html soup.html.contents[0] soup.html.contents[0].contents[0] soup.html.contents[0].contents[0].contents[0] soup.html.contents[0].contents[0].contents[0].contents[0] soup.html.contents[0].contents[0].contents[0].next_sibling soup.html.contents[0].contents[0].contents[0].parent soup.html.contents[0].next_sibling soup.html.contents[0].next_sibling.next_sibling soup.html.contents[0].next_sibling.next_sibling.next_sibling soup.html.contents[0].next_sibling.next_sibling.contents[0] soup.html.next_element # also: previous_element soup.find_all(["a", "b"]) import re [str(x.name) for x in soup.find_all(re.compile(r'^b'))] [str(x.name) for x in soup.find_all(True)] # See more at: http://omz-software.com/pythonista/docs/ios/beautifulsoup_guide.html ''' Let's combine these things... ''' cis192 = 'http://cis.upenn.edu/~cis192' r = requests.get(cis192) r.text soup_192 = BeautifulSoup(r.text, "html.parser") print(soup_192.prettify()) [str(x['href']) for x in soup_192.find_all('a')] [str(x['href']) for x in soup_192.find(id="material").find_all('a')] reg = re.compile(r'\.pdf$') list(filter(lambda s: reg.search(s), hrefs)) def main(): pass if __name__ == '__main__': main()