# This is kind-of based off of this: http://stackoverflow.com/questions/5974595/download-all-the-linksrelated-documents-on-a-webpage-using-python

import cookielib
import urllib2
import mechanize
from time import sleep
import os
import cgi
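# Note: this is Python 2 code (cookielib and urllib2 became http.cookiejar
# and urllib.request in Python 3), and it relies on the third-party
# mechanize package: pip install mechanize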
# A routine to download a file from a link, by simulating a click on it
def downloadlink(linkUrl, referer):
    r = br.click_link(linkUrl)
    r.add_header("Referer", referer)  # add a Referer header, just in case
    response = br.open(r)

    # get the filename from the response headers if possible
    cdheader = response.info().getheader('Content-Disposition')
    if cdheader:
        value, params = cgi.parse_header(cdheader)
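        # e.g. cgi.parse_header('attachment; filename="report.pdf"') returns
        # ('attachment', {'filename': 'report.pdf'}); "report.pdf" is just an
        # illustrative value here, not a filename from the original post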
        filename = params["filename"]
    else:
        # if not, use the link's basename
        filename = os.path.basename(linkUrl.url)

    f = open(filename, "wb")  # binary mode, since PDFs are binary data. TODO: perhaps ensure that the file doesn't already exist? (a sketch follows below)
    f.write(response.read())  # write the response content to disk
    f.close()
    print filename, "has been downloaded"
    br.back()
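# A minimal sketch for the TODO above: pick a name that will not clobber an
# existing file by appending a counter. The helper name uniquefilename is my
# own, not part of the original script; it could be called as
# filename = uniquefilename(filename) just before the open() call.
def uniquefilename(filename):
    base, ext = os.path.splitext(filename)
    candidate, n = filename, 1
    while os.path.exists(candidate):
        candidate = "%s-%d%s" % (base, n, ext)
        n += 1
    return candidate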
# Make a Browser (think of this as Chrome or Firefox, etc.)
br = mechanize.Browser()

# Enable cookie support for urllib2
cookiejar = cookielib.LWPCookieJar()
br.set_cookiejar(cookiejar)
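# Optionally, cookies could be persisted between runs (a sketch; the name
# 'cookies.txt' is arbitrary, not from the original post):
# cookiejar.save('cookies.txt', ignore_discard=True)   # at the end of a run
# cookiejar.load('cookies.txt', ignore_discard=True)   # at the start of a run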
# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]  # masquerade as a real browser; this is not nice to do, though

# Open your site
mypageUrl = 'http://my.url.com/page'
br.open(mypageUrl)
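# Note: br.open() raises urllib2.HTTPError (or URLError) if the page cannot
# be fetched, so wrapping it in try/except is one way to fail gracefully
# (a sketch, not part of the original post):
# try:
#     br.open(mypageUrl)
# except urllib2.HTTPError as e:
#     print "Could not open page:", e.code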
print "Get all PDF links\n"
filetypes = ["pdf", "PDF"]  # pattern matching for links; can add more kinds here
myfiles = []
for l in br.links():
    # check if this link has the file extension or text we want
    myfiles.extend([l for t in filetypes if t in l.url or t in l.text])
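# Note: a link matching more than one entry in filetypes (say, "pdf" in its
# URL and "PDF" in its text) is appended twice by the comprehension above.
# A minimal de-duplication sketch, keyed on the link URL (uniquelinks is my
# own name, not from the original post):
# seen = set()
# uniquelinks = []
# for l in myfiles:
#     if l.url not in seen:
#         seen.add(l.url)
#         uniquelinks.append(l)
# myfiles = uniquelinks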
for l in myfiles:
    # for index, l in zip(range(100), myfiles):  # <--- uncomment this line (and comment the one above) to download only the first 100 links
    # sleep(1)  # uncomment to throttle downloads, so you don't hammer the site
    downloadlink(l, mypageUrl)