# This is kind-of based off of this: http://stackoverflow.com/questions/5974595/download-all-the-linksrelated-documents-on-a-webpage-using-python

# Standard-library (Python 2) and third-party imports.
# NOTE: this script is Python 2 only -- cookielib/urllib2 were renamed in
# Python 3 (http.cookiejar / urllib.request) and the print statements below
# are py2 syntax.
import cookielib
import os
import cgi
from time import sleep

import mechanize  # third-party: simulates a browser (forms, links, cookies)

| # A routine to download a file from a link, by simulating a click on it |
| defdownloadlink(linkUrl, referer): |
| r=br.click_link(linkUrl) |
| r.add_header("Referer", referer) # add a referer header, just in case |
| response=br.open(r) |
|
| #get filename from the response headers if possible |
| cdheader=response.info().getheader('Content-Disposition') |
| ifcdheader: |
| value, params=cgi.parse_header(cdheader) |
| filename=params["filename"] |
| else: |
| # if not use the link's basename |
| filename=os.path.basename(linkUrl.url) |
|
| f=open(filename, "w") #TODO: perhaps ensure that file doesn't already exist? |
| f.write(response.read()) # write the response content to disk |
| printfilename," has been downloaded" |
| br.back() |
|
# Make a Browser (think of this as Chrome or Firefox etc.)
br = mechanize.Browser()

# Enable cookie support (mechanize wraps urllib2-style handlers)
cookiejar = cookielib.LWPCookieJar()
br.set_cookiejar(cookiejar)

# Browser options
br.set_handle_equiv(True)      # honour http-equiv meta tags
br.set_handle_gzip(True)       # accept gzip-compressed responses
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)    # ignore robots.txt (be considerate!)
# Follow <meta refresh> redirects, but don't wait longer than a second.
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
# Masquerade as a real browser. This is not nice to do, though.
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

# Open your site
mypageUrl = 'http://my.url.com/page'
br.open(mypageUrl)

| print"Get all PDF links\n" |
| filetypes=["pdf", "PDF"] # pattern matching for links, can add more kinds here |
| myfiles=[] |
| forlinbr.links(): |
| #check if this link has the file extension or text we want |
| myfiles.extend([lfortinfiletypesiftinl.urlortinl.text]) |
|
| forlinmyfiles: |
| # for index, l in zip(range(100), myfiles): # <--- uncomment this line (and coment the one above) to download 100 links. |
| #sleep(1) # uncomment to throttle downloads, so you dont hammer the site |
| downloadlink(l, mypageUrl) |