How to download a PDF from a link in Python
The URL of the image to be downloaded is defined as `image_url`. Open the output file with `with open("python.pdf", "wb") as pdf:` and write to the PDF file one chunk at a time. More generally, create a variable and assign it the URL of the file to be downloaded: `url = ''`. I am going to use the `requests` library for Python to download files efficiently: set `url = ''`, then call `r = requests.get(url)`.
# This is kind-of based off of this: (the reference link is missing from this copy)
#
# Download every PDF linked from a single web page by driving a mechanize
# browser. The script uses Python 2 module names; the import fallback below
# also lets it run on Python 3 releases that still ship ``cgi`` (the module was
# removed in Python 3.13).
import cgi
import os
from time import sleep  # only used if the throttle line near the end is enabled

try:
    import cookielib
    import urllib2  # kept from the original script; not referenced directly
except ImportError:  # Python 3 renamed both modules
    import http.cookiejar as cookielib
    import urllib.request as urllib2

import mechanize


# A routine to download a file from a link, by simulating a click on it
def downloadlink(linkUrl, referer):
    """Download the target of ``linkUrl`` into the current working directory.

    The click is simulated on the module-level browser ``br``; afterwards the
    browser is stepped back so the remaining links are followed from the same
    page.

    Args:
        linkUrl: A link object taken from ``br.links()``.
        referer: URL sent as the ``Referer`` header of the download request.
    """
    request = br.click_link(linkUrl)
    request.add_header("Referer", referer)  # add a referer header, just in case
    response = br.open(request)

    # Get the filename from the response headers if possible.
    filename = None
    cdheader = response.info().get('Content-Disposition')
    if cdheader:
        _, params = cgi.parse_header(cdheader)
        # The header is controlled by the server: keep only the last path
        # component so it cannot steer the write outside the current directory.
        # A header without a filename parameter falls through to the link name.
        filename = os.path.basename(params.get("filename", ""))
    if not filename:
        # If not, use the link's basename.
        filename = os.path.basename(linkUrl.url)

    # TODO: perhaps ensure that file doesn't already exist?
    # Binary mode keeps the PDF bytes intact; ``with`` closes the file even if
    # the read or the write fails.
    with open(filename, "wb") as f:
        f.write(response.read())  # write the response content to disk
    print(filename + " has been downloaded")
    br.back()


# Make a Browser (think of this as chrome or firefox etc)
br = mechanize.Browser()

# Enable cookie support for urllib2
cookiejar = cookielib.LWPCookieJar()
br.set_cookiejar(cookiejar)

# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

# Masquerade as a real browser. This is not nice to do though.
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

# Open your site
mypageUrl = ''  # set this to the page whose PDF links should be downloaded
if not mypageUrl:
    raise SystemExit("Set mypageUrl to the page to scan before running this script.")
br.open(mypageUrl)

print("Get all PDF links\n")
filetypes = ["pdf", "PDF"]  # pattern matching for links, can add more kinds here

# Check if each link has the file extension or text we want. ``any`` adds a
# link once even when several patterns match it, so nothing is fetched twice.
# NOTE(review): link.text is guarded in case it can be None - confirm against
# the mechanize Link documentation.
myfiles = [
    link for link in br.links()
    if any(t in link.url or t in (link.text or "") for t in filetypes)
]

for link in myfiles:
    # for index, link in zip(range(100), myfiles):  # <--- uncomment this line (and comment the one above) to download 100 links.
    # sleep(1)  # uncomment to throttle downloads, so you dont hammer the site
    downloadlink(link, mypageUrl)

# Source: []
