#! /usr/bin/env python3
# -*- coding: utf-8 -*-

# flo-download-flickr-collection --- Automatic download of multiple files from
#                                    http://www.flickr.com/ (or any web site
#                                    with similar structure)
#
# Copyright (c) 2009, 2013 Florent Rougon
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; see the file COPYING. If not, write to the
# Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
# Boston, MA  02110-1301 USA.

# TODO: better building of absolute URLs from the values of href attributes:
# one should at least recognize absolute URLs and use the <base> tag, if any,
# to interpret relative URLs. One should also recognize scheme-less URIs such
# as //example.com/foo/bar (AKA protocol-relative URIs) [cf. RFC 3986]; a
# sketch of this is given after the imports below.

import sys, os, argparse, time, stat, locale, re, subprocess, traceback, logging
import random, abc, datetime
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

try:
    import requests
except ImportError:
    HAS_REQUESTS = False
else:
    HAS_REQUESTS = True

import textwrap
from textwrap import dedent
tw = textwrap.TextWrapper(width=80, break_long_words=False,
                          break_on_hyphens=False)
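

# A minimal sketch of the href resolution described in the TODO above
# (hypothetical helper, not called anywhere yet): urllib.parse.urljoin()
# already handles absolute and protocol-relative URLs per RFC 3986, so the
# only extra work is resolving against the <base> tag when one is present.
def resolveHref(pageUrl, soup, href):
    """Resolve 'href' found on the page at 'pageUrl', honoring <base>."""
    baseTag = soup.find("base", href=True)
    if baseTag is not None:
        # A <base> tag may itself be relative; resolve it against the page URL
        pageUrl = urllib.parse.urljoin(pageUrl, baseTag["href"])
    return urllib.parse.urljoin(pageUrl, href)
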

defaultStartUrl = "http://www.flickr.com/photos/bibliodyssey/sets/"
defaultNbRetries = 12
defaultRetryDelay = 15     # in seconds
defaultDelayBeforeDownload = 2  # in seconds
# defaultOutputDir = os.path.expanduser("~/peacay")

# Will be set when the command line is parsed
params = None

progname = os.path.basename(sys.argv[0])
progversion = "0.7.1"
version_blurb = """Written by Florent Rougon.

Copyright (c) 2009-2013  Florent Rougon
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."""

imagePagePath_cre = re.compile(
    r'(?P<dir>/.+)/(?P<img_id>[0-9]+)/in/set-(?P<set_id>[0-9]+)/?$')
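# Example of a matching path: /photos/bibliodyssey/1234567890/in/set-987654/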
# Extensions to try when checking if we already have an image file, for the
# cases where we don't know its extension.
searchedExtensions = ("png", "gif", "jpg", "jpeg", "tiff", "svg", "pdf")
searchedExtensionsAllCases = list(searchedExtensions)
searchedExtensionsAllCases.extend([ext.upper() for ext in
                                   searchedExtensionsAllCases])
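# i.e., the lowercase extensions followed by their uppercase variants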

# Will be set in main(); not used anymore (we use str objects and let Python
# encode and decode with its default parameters)
preferredEncoding = None


def setup_logging(level=logging.NOTSET, chLevel=None):
    global logger

    if chLevel is None:
        chLevel = level

    logger = logging.getLogger(progname)
    logger.setLevel(level)
    # Create console handler and set its level
    ch = logging.StreamHandler() # Uses sys.stderr by default
    ch.setLevel(chLevel)  # NOTSET, DEBUG, INFO, WARNING, ERROR, CRITICAL
    # Logger name with :%(name)s... many other things available
    # formatter = logging.Formatter("[%(threadName)s] %(message)s")
    formatter = logging.Formatter("%(levelname)s %(message)s")
    # Add formatter to ch
    ch.setFormatter(formatter)
    # Add ch to logger
    logger.addHandler(ch)

setup_logging()
# Effective level for all child loggers with NOTSET level
logger.setLevel(logging.INFO)


# Exceptions raised by this module
class error(Exception):
    """Base class for exceptions in flo-download-flickr-collection."""
    def __init__(self, message=None):
        self.message = message

    def __str__(self):
        return self.complete_message()

    def __repr__(self):
        return "{}({!r})".format(self.__class__.__name__, self.message)

    def complete_message(self):
        if self.message:
            return "{0}: {1}".format(self.ExceptionShortDescription,
                                     self.message)
        else:
            return self.ExceptionShortDescription

    ExceptionShortDescription = "{0} generic exception".format(progname)


class ExternalProgramCannotBeExecuted(error):
    """Exception raised when an external program cannot be executed."""
    ExceptionShortDescription = "unable to execute this program"

class ExternalProgramReturnedAbnormalExitCode(error):
    """Exception raised when an external program returned an abnormal exit code."""
    def __init__(self, program, retcode):
        self.program = program
        self.retcode = retcode
        self.message = "%s, %d" % (self.program, self.retcode)

    ExceptionShortDescription = "abnormal exit code from external program"

class ExternalProgramTerminatedBySignal(error):
    """Exception raised when an external program is terminated by a signal."""
    def __init__(self, program, signum):
        self.program = program
        self.signum = signum
        self.message = "%s, %d" % (self.program, self.signum)

    ExceptionShortDescription = "program terminated by a signal"

class DirectoryAlreadyExistsAsFile(error):
    """Exception raised when failing to create a directory because the path \
already exists and is not a directory."""
    ExceptionShortDescription = "path already exists and is not a directory"

class UrlOpenError(error):
    """Exception raised when failing to open a URL."""
    ExceptionShortDescription = "unable to open URL"

class ExitFromCommandLineOrConfigFileParsing(error):
    """Exception raised to exit from command line and config file parsing"""
    def __init__(self, message=None, file=sys.stderr, exit_status=2):
        super().__init__(message)
        self.file = file
        self.exit_status = exit_status

    def __repr__(self):
        return """{classname}(
  message={msg!r},
  file={file!r},
  exit_status={status!r})""".format(classname=self.__class__.__name__,
                                    file=self.file,
                                    msg=self.message,
                                    status=self.exit_status)


class UrlOpener(metaclass=abc.ABCMeta):
    def __init__(self, retries=defaultNbRetries, retryDelay=defaultRetryDelay):
        self.retries = retries
        self.retryDelay = retryDelay

    @abc.abstractmethod
    def rawOpenUrl(self, url, **kwargs):
        """Low-level opening of 'url', without any code to retry after an error.

        Implementations must raise UrlOpenError when encountering an error
        condition that should lead to a retry (i.e., a non-fatal error).

        The method must return an object that is either a 'bytes' or
        'bytearray' instance, or a file-like object in binary mode (the two
        main users of this class are BeautifulSoup and
        UrlOpenerBasedDownloader; the former would happily accept 'str'
        instances as well, but the latter absolutely needs binary contents,
        otherwise it would be impossible to download anything other than text
        files).

        """
        raise NotImplementedError()

    def openUrl(self, url, **kwargs):
        """High-level opening of 'url', using self.retries and self.retryDelay.

        self.rawOpenUrl() is used to perform the actual opening of 'url'.

        """
        for i in range(1 + self.retries):
            if i != 0:
                # Exponential backoff
                sleepTime = self.retryDelay*2**(i-1)
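                # e.g., with the default retryDelay of 15 s: 15, 30, 60, 120...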

                if sleepTime < 60:
                    delayStr = "{} seconds".format(sleepTime)
                else:
                    delayStr = "{} seconds (i.e., {})".format(
                        sleepTime, datetime.timedelta(seconds=sleepTime))

                cause = getattr(lastException, "__cause__", None)
                complement = " ({})".format(cause) if cause is not None else ""

                logger.warning(
                    "Error while trying to open URL {url}{complement}, retrying "
                    "in {delay}".format(url=url, complement=complement,
                                        delay=delayStr))
                time.sleep(sleepTime)
            try:
                fileOrBytes = self.rawOpenUrl(url, **kwargs)
            except UrlOpenError as e:
                lastException = e
                lastError = traceback.format_exc()
                continue
            else:
                return fileOrBytes

        sys.exit("{}\n{}: aborting after {} retries for URL {}.".format(
                lastError, progname, self.retries, url))


class UrllibBasedUrlOpener(UrlOpener):
    def rawOpenUrl(self, url, **kwargs):
        try:
            f = urllib.request.urlopen(url, **kwargs)
        except urllib.error.HTTPError as e:
            if e.code == 404:
                sys.exit("Fatal: got an HTTP 404 error trying to download {}"
                         .format(url))
            else:
                # Allow retrying of the failed download
                raise UrlOpenError(url) from e
        # This is the generic exception for urllib
        except urllib.error.URLError as e:
            # # Gateway Time-out, was for urllib.error.HTTPError
            # if e.code == 504:
            raise UrlOpenError(url) from e
        else:
            return f


class RequestsBasedUrlOpener(UrlOpener):
    def rawOpenUrl(self, url, **kwargs):
        try:
            r = requests.get(url, timeout=60, **kwargs)
            if r.status_code == 404:
                sys.exit("Fatal: got an HTTP 404 error trying to download {}"
                         .format(url))
            # Only raises an exception if r.status_code indicates some kind of
            # error condition (e.g., a 200 response does not cause any
            # exception to be raised).
            r.raise_for_status()
        # This is the generic exception for requests
        except requests.exceptions.RequestException as e:
            raise UrlOpenError(url) from e
        else:
            # return io.StringIO(r.content)
            #   → initial_value must be str or None, not bytes
            return r.content


urlOpenerFactories = {"urllib": UrllibBasedUrlOpener,
                      "requests": RequestsBasedUrlOpener}


class Downloader(metaclass=abc.ABCMeta):
    def __init__(self, delayBeforeDownload=defaultDelayBeforeDownload):
        self.delayBeforeDownload = delayBeforeDownload

    def wait(self):
        """Wait a bit in order to fool statiscal analysers."""
        time.sleep(random.uniform(0.75, 1.25)*self.delayBeforeDownload)

    @abc.abstractmethod
    def download(self, url, output, **kwargs):
        return

    def waitAndDownload(self, *args, **kwargs):
        self.wait()
        return self.download(*args, **kwargs)


class UrlOpenerBasedDownloader(Downloader):
    def __init__(self, urlOpener, bufSize=2**16,
                 delayBeforeDownload=defaultDelayBeforeDownload):
        super().__init__(delayBeforeDownload)
        self.urlOpener = urlOpener
        self.bufSize = bufSize
        self.downloadedBytes = 0
        self.downloadedFiles = 0

    def fileBasedDownload(self, f, output):
        """Download using a binary file-like object 'f'.

        urllib.request.urlopen() returns such objects, but any object
        providing the same interface as a file opened in binary mode is
        acceptable as 'f'.

        """
        with open(output, "wb") as of:
            # Number of bytes downloaded for this file
            downloaded_file = 0

            while True:
                chunk = f.read(self.bufSize)
                if not chunk:
                    break

                # Use the actual chunk size: the last chunk read is usually
                # shorter than self.bufSize
                downloaded_file += len(chunk)
                self.downloadedBytes += len(chunk)

                # Impossible with logging
                # sys.stderr.write(
                #     "\r{prefix}Downloaded {bytes} bytes (total: {MiB:.1f} MiB)"
                #     .format(prefix=msgPrefix, bytes=downloaded_file,
                #             MiB=float(downloaded_total/2**20)))
                # sys.stderr.flush()

                of.write(chunk)

    def download(self, url, output):
        f = self.urlOpener.openUrl(url)

        if isinstance(f, (bytes, bytearray)):
            with open(output, "wb") as of:
                of.write(f)
            self.downloadedBytes += len(f)
        else:
            self.fileBasedDownload(f, output)

        self.downloadedFiles += 1


class ExternalDownloader(Downloader):
    def __init__(self, program, retries=defaultNbRetries,
                 retryDelay=defaultRetryDelay,
                 delayBeforeDownload=defaultDelayBeforeDownload):
        super().__init__(delayBeforeDownload)
        self.program = program
        self.retries = retries
        self.retryDelay = retryDelay
        self.downloadedBytes = 0
        self.downloadedFiles = 0

    @abc.abstractmethod
    def buildArgs(self, url, output):
        raise NotImplementedError()

    def download(self, url, output):
        args = self.buildArgs(url, output)

        try:
            p = subprocess.Popen(args, shell=False, stdout=None,
                                 stderr=None, close_fds=True)
            retcode = p.wait()
        except OSError as e:
            raise ExternalProgramCannotBeExecuted("%s: %s" % (self.program,
                                                              e.strerror))
        if retcode > 0:
            # raise ExternalProgramReturnedAbnormalExitCode(self.program,
            #                                               retcode)
            logger.warning("\nNon-zero return code from {program} ({code})\n"
                           .format(program=self.program, code=retcode))
        elif retcode < 0:
            raise ExternalProgramTerminatedBySignal(self.program, -retcode)
        else:
            self.downloadedBytes += os.stat(output).st_size
            self.downloadedFiles += 1


# wget is an interesting choice because it times out and retries when the
# connection stalls, resuming the download where it stopped. In the same
# situation, urllib2 seems to get stuck forever.
class WGetDownloader(ExternalDownloader):
    def __init__(self, program="wget", **kwargs):
        super().__init__(program, **kwargs)

    def buildArgs(self, url, output):
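        # -c resumes a partially-downloaded file; -O sets the output file name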
        return [ self.program, "--tries={}".format(1+self.retries),
                 "--timeout=60",
                 "--wait={}".format(self.retryDelay),
                 "--random-wait", "-c", "-O", output, url ]


class CurlDownloader(ExternalDownloader):
    def __init__(self, program="curl", **kwargs):
        super().__init__(program, **kwargs)

    def buildArgs(self, url, output):
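        # Note: curl's --retry does its own backoff; self.retryDelay is unused
        # here (cf. the --retry-delay help text: some backends ignore it)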
        return [ self.program, "--retry", str(self.retries),
                 "-o", output, url ]


def nextTagFilter(tag):
    """Used for the Sets Overview pages as well as for individual Sets pages"""
    return tag.name == "a" and \
        "href" in tag.attrs and \
        ("data-track", "next") in tag.attrs.items()


class SetMetaData:
    def __init__(self, rootUrl=None, num=None, name=None, id=None, URL=None):
        values = locals()
        for param in ("rootUrl", "num", "name", "id", "URL"):
            setattr(self, param, values[param])

class SetsOverviewPageParser:
    def __init__(self, urlOpener, parser=None):
        self.urlOpener = urlOpener
        self.parser = parser

    def parse(self, input_, rootUrl):
        pageNum = 0
        setNum = 0
        sets = []

        while True:
            pageNum += 1
            logger.info("Sets Overview page {0}".format(pageNum))
            soup = BeautifulSoup(input_, self.parser)
            # print("\n", soup.prettify(), file=sys.stderr, end="\n\n")
            tags = soup.find_all("a", class_="Seta", href=True, title=True)

            if not tags:
                logger.warning("Could not find any set on page number "
                               "{} of the Sets Overview.".format(pageNum))
                break

            for tag in tags:
                setNum += 1
                setName = tag["title"]
                logger.info("Found set number {}: '{}'...".format(
                        setNum, setName))

                pageUrl = urllib.parse.urljoin(rootUrl, tag["href"])
                sets.append(SetMetaData(rootUrl=rootUrl,
                                        num=setNum,
                                        name=setName,
                                        URL=pageUrl))

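            # 'tag' is the last set link found above; look for the
            # "next page" link after it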
            nextTag = tag.find_next(nextTagFilter)
            if nextTag is not None:
                pageUrl = urllib.parse.urljoin(rootUrl, nextTag["href"])
                input_ = self.urlOpener.openUrl(pageUrl)
            else:
                break

        return sets


def thumbTagFilter(tag):
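    """Match thumbnail links of the form
    <a href="..." title="..." data-track="photo-click"> containing at least
    one <img> tag."""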
    return tag.name == "a" and \
        "href" in tag.attrs and \
        "title" in tag.attrs and \
        ("data-track", "photo-click") in tag.attrs.items() and \
        tag.find_all("img")


class ImageMetaData:
    def __init__(self, setMD=None, numInSet=None, id=None, title=None,
                 pageUrl=None, imageUrl=None):
        values = locals()
        for param in ("setMD", "numInSet", "id", "title", "pageUrl",
                      "imageUrl"):
            setattr(self, param, values[param])

class SetPageParser:
    def __init__(self, urlOpener, parser=None):
        self.urlOpener = urlOpener
        self.parser = parser

    def parse(self, setMD, input_=None):
        imagePages = []
        imageNum = 0
        pageNum = 0
        if input_ is None:
            input_ = self.urlOpener.openUrl(setMD.URL)

        while True:
            pageNum += 1
            soup = BeautifulSoup(input_, self.parser)
            tags = soup.find_all(thumbTagFilter)

            # for dtag in soup.find_all("div", class_="photo-display-item"):
            #     # Stop at the first matching tag
            #     tag = dtag.find(thumbTagFilter)
            if not tags:
                logger.warning("Could not find any image on page number "
                               "{} of Set {} ({}).".format(pageNum, setMD.num,
                                                            setMD.name))
                break

            for tag in tags:
                path = tag["href"]
                imageTitle = tag["title"]

                mo = imagePagePath_cre.match(path)
                if not mo:
                    sys.exit("%s: unable to parse image path '%s'"
                             % (progname, path))

                prefix = " " * 0
                imageNum += 1
                msg = "{0}Found image nb {1} ({2})...".format(
                    prefix, imageNum, imageTitle)
                logger.debug(msg)

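                # Build the "original size" page path, e.g.
                # "/photos/<user>/<img_id>/sizes/o/in/set-<set_id>/"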
                imagePagePath_origSize = "%s/%s/sizes/o/in/set-%s/" \
                                         % mo.group("dir", "img_id", "set_id")
                imagePageUrl = urllib.parse.urljoin(setMD.rootUrl,
                                                    imagePagePath_origSize)

                imagePages.append(ImageMetaData(setMD=setMD,
                                                numInSet=imageNum,
                                                id=mo.group("img_id"),
                                                title=imageTitle,
                                                pageUrl=imagePageUrl))

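            # 'tag' is the last thumbnail found above; look for the
            # "next page" link after it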
            nextTag = tag.find_next(nextTagFilter)
            if nextTag is not None:
                pageUrl = urllib.parse.urljoin(setMD.rootUrl, nextTag["href"])
                input_ = self.urlOpener.openUrl(pageUrl)
            else:
                break

        return imagePages


class ImagePageParser:
    def __init__(self, urlOpener=None, parser=None):
        self.urlOpener = urlOpener
        self.parser = parser

    def parse(self, imageMD, input_=None):
        assert not (input_ is None and self.urlOpener is None)
        if input_ is None:
            input_ = self.urlOpener.openUrl(imageMD.pageUrl)

        soup = BeautifulSoup(input_, self.parser)

        # with open(os.path.expanduser(
        #         "~/tmp/peacay.htmlcode/image_page_prettified.html"), "w") as f:
        #     print(soup.prettify(), file=f)

        tag = soup.find("div", id="allsizes-photo")
        imgTag = tag.find("img")
        assert imgTag is not None, \
            'Expected an <img> tag inside <div id="allsizes-photo">'

        imageMD.imageUrl = imgTag["src"]


def make_sure_directory_exists(directory):
    if not os.path.isdir(directory):
        if os.path.exists(directory):
            raise DirectoryAlreadyExistsAsFile(directory)
        else:
            os.makedirs(directory)


def print_stats(downloader, exploredSets, sets):
    print("Download completed ({nbFiles} image(s) in {exploredSets} set(s) "
          "[among a total of {totalSets}], {bytesTotal:.1f} MiB).".format(
          nbFiles=downloader.downloadedFiles, exploredSets=exploredSets,
          totalSets=len(sets),
          bytesTotal=float(downloader.downloadedBytes/2**20)), file=sys.stderr)
    print("Sending report to <sarko@hadopi.fr>...", end=" ", flush=True,
          file=sys.stderr)
    time.sleep(1.0)
    print("done.", file=sys.stderr)


def imagePaths(imageMD, noext=False):
    assert noext or imageMD.imageUrl is not None, (noext, imageMD.imageUrl)

    oDir = os.path.join(params.output_dir, imageMD.setMD.name)
    txtFile = os.path.join(oDir, imageMD.id + ".txt")

    if imageMD.imageUrl is not None:
        u = urllib.parse.urlsplit(imageMD.imageUrl)
        root, ext = os.path.splitext(u.path)
        imgBaseName = imageMD.id + ext
    else:
        imgBaseName = imageMD.id

    imgFile = os.path.join(oDir, imgBaseName)
    return {"dir": oDir,
            "description": txtFile,
            "image": imgFile,
            "basename": imgBaseName}


def downloadImage(imageMD, downloader, number=None, total=None):
    imgPaths = imagePaths(imageMD)
    imgFile = imgPaths["image"]
    imgBaseName = imgPaths["basename"]

    if os.path.exists(imgFile):
        if number is not None and total is not None:
            complement = " ({}/{})".format(number, total)
        else:
            complement = ""
        logger.info("{}{} is already present, skipping.".format(imgBaseName,
                                                                complement))
    else:
        # Download the image
        make_sure_directory_exists(imgPaths["dir"])

        if number is not None and total is not None:
            complement = "{}/{} ".format(number, total)
        else:
            complement = ""
        logger.info("Downloading {img} {cmpl}{title}...".format(
                img=imgBaseName, cmpl=complement, title=imageMD.title))

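        # Download to a '.part' file first so that an interrupted transfer
        # can be detected and removed later by remove_partial_downloads()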
        tmpFile = "%s.part" % imgFile
        downloader.waitAndDownload(imageMD.imageUrl, tmpFile)
        os.rename(tmpFile, imgFile)

    # Write a text file containing (hopefully) a description of the
    # image
    text = imageMD.title + '\n'
    with open(imgPaths["description"], "w") as f:
        f.write(text)


def parse_and_download():
    urlOpener = urlOpenerFactories[params.urlopen_method](
        retries=params.retries, retryDelay=params.retry_delay)

    if params.download_method == "wget":
        downloader = WGetDownloader(
            retries=params.retries, retryDelay=params.retry_delay,
            delayBeforeDownload=params.delay_before_download)
    elif params.download_method == "curl":
        downloader = CurlDownloader(
            retries=params.retries,
            delayBeforeDownload=params.delay_before_download)
    else:
        downloader = UrlOpenerBasedDownloader(
            urlOpener,
            delayBeforeDownload=params.delay_before_download)

    setsOverviewPageParser = SetsOverviewPageParser(urlOpener, params.parser)
    setPageParser = SetPageParser(urlOpener, params.parser)
    imagePageParser = ImagePageParser(urlOpener, params.parser)

    o = urllib.parse.urlparse(params.start_url)
    root_url = "{}://{}/".format(o.scheme, o.netloc)
    sets = setsOverviewPageParser.parse(urlOpener.openUrl(params.start_url),
                                        root_url)
    try:
        for i, setMD in enumerate(sets):
            logger.info(
                "  --> Entering set {setNum}/{nbSets} ({setName})."
                .format(setNum=setMD.num, nbSets=len(sets), setName=setMD.name))
            imagePages = setPageParser.parse(setMD)
            logger.info("      There are {} images in this set."
                        .format(len(imagePages)))
            for j, imageMD in enumerate(imagePages):
                if params.quick_check:
                    imgPaths = imagePaths(imageMD, noext=True)
                    found = False

                    for ext in searchedExtensionsAllCases:
                        path = "{}.{}".format(imgPaths["image"], ext)
                        basename = "{}.{}".format(imgPaths["basename"], ext)
                        if os.path.exists(path):
                            logger.info(
                                "{} ({}/{}) is already present, skipping."
                                .format(basename, j+1, len(imagePages)))
                            found = True
                            break

                    if found:
                        continue

                imagePageParser.parse(imageMD)
                # Now, imageMD.imageUrl contains a URL to the original-sized
                # image
                downloadImage(imageMD, downloader, number=j+1,
                              total=len(imagePages))
    except KeyboardInterrupt:
        print("\n", file=sys.stderr)

    if sets:
        print_stats(downloader, i+1, sets)


def os_walk_dumb_error_handling(exception):
    raise exception


def remove_partial_downloads(outputDir):
    if not os.path.exists(outputDir):
        return

    mode = os.stat(outputDir).st_mode
    if stat.S_ISDIR(mode):
        for root, dirs, files in os.walk(outputDir,
                                         onerror=os_walk_dumb_error_handling):
            for f in files:
                if f.endswith(".part"):
                    path = os.path.join(root, f)
                    logger.info(
                        "Removing leftover partial download '%s'..." % path)
                    os.unlink(path)



def process_command_line():
    params = argparse.Namespace()

    parser = argparse.ArgumentParser(
        usage="""\
%(prog)s [OPTION ...] OUTPUTDIR
Automatically download image files from a flickr-like web site.""",
        description="""\
Images and the associated descriptions are fetched from the image collection
located at STARTURL and stored into subdirectories of OUTPUTDIR.

The web page referenced by STARTURL must be a Sets Overview page
listing the various sets in a collection, positioned at the first page of
the Sets Overview.""",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # I want --help but not -h (it might be useful for something else)
        add_help=False)

    parser.add_argument('output_dir', metavar="OUTPUTDIR",
                        help="""\
      base directory for storing the images and their descriptions""")
    parser.add_argument('-s', '--start-url', metavar="STARTURL",
                        default=defaultStartUrl,
                        help="""\
      base URL of the image collection to download from
      (default: %(default)s)""")
    parser.add_argument('--quick-check', choices=("True", "False"),
                        default="True",
                        help="""\
      when checking if we already have an image, try a well-known set of
      extensions {searchedExtensions} with their upper case variants instead
      of fetching the image page, which would be the only way to know the
      extension of the original image, but takes more time (default:
      %(default)s)"""
                        .format(searchedExtensions=tuple(searchedExtensions)))
    parser.add_argument('-p', '--parser',
                        help="""\
      method to use for parsing HTML pages; this must be a valid value
      for the second argument of bs4.BeautifulSoup(). Possible values include
      'html.parser', 'lxml', 'html5lib', but some of these will only work if
      the corresponding libraries are installed (default: let BeautifulSoup
      decide which method to use)""")
    parser.add_argument('-w', '--delay-before-download', type=float,
                        default=defaultDelayBeforeDownload,
                        help="""\
      time to wait before downloading an image [will be multiplied by a random
      number in order to fool statistical analysis by nasty web
      servers] (default: %(default)s)""")
    parser.add_argument('--retries', type=int, default=defaultNbRetries,
                        help="""\
      number of times to retry a failed download (default: %(default)s)""")
    parser.add_argument('--retry-delay', type=float, default=defaultRetryDelay,
                        help="""\
      base delay in seconds before retrying a failed download [actually, most
      backends use an exponential backoff algorithm in order to increase the
      chances of recovering from the error; some backends ignore this value]
      (default: %(default)s)""")
    parser.add_argument('-D', '--download-method', choices=("urlopen", "wget",
                                                            "curl"),
                        default="urlopen", help="""\
      method to use to download image files (default: %(default)s)""")
    parser.add_argument('-U', '--urlopen-method', choices=("urllib", "requests"),
                        default=None, help="""\
      method to use to open URLs (default: 'requests' if available, otherwise
      'urllib')""")
    parser.add_argument('--help', action="help",
                        help="display this message and exit")
    # The version text is not wrapped when using
    # formatter_class=argparse.RawDescriptionHelpFormatter
    parser.add_argument('--version', action='version',
                        version="{name} {version}\n{blurb}".format(
            name=progname, version=progversion, blurb=version_blurb))

    params = parser.parse_args(namespace=params)
    # argparse leaves the choice as a string; convert it to a real boolean
    # (note that bool("False") would be True, hence no type=bool above)
    params.quick_check = (params.quick_check == "True")

    if params.urlopen_method is None:
        params.urlopen_method = "requests" if HAS_REQUESTS else "urllib"
    elif params.urlopen_method == "requests" and not HAS_REQUESTS:
        msg = dedent("""\
          cannot use 'requests' as the urlopen method: 'requests' module is not
          available""")
        raise ExitFromCommandLineOrConfigFileParsing(msg)

    return params


def main():
    global params, preferredEncoding

    locale.setlocale(locale.LC_ALL, '')
    preferredEncoding = locale.getpreferredencoding()

    try:
        params = process_command_line()
    except ExitFromCommandLineOrConfigFileParsing as e:
        if e.message is not None:
            print(tw.fill("{}: {}".format(progname, e.message)), file=e.file)
        sys.exit(e.exit_status)

    try:
        remove_partial_downloads(params.output_dir)
        parse_and_download()

        sys.exit(0)
    except error as e:
        sys.stderr.write("Error: {}\n".format(e))
        sys.exit(1)


if __name__ == "__main__": main()
