#! /usr/bin/env python3
# -*- coding: utf-8 -*-

# flo-download-flickr-collection --- Automatic download of multiple files from
#                                    http://www.flickr.com/ (or any web site
#                                    with similar structure)
#
# Copyright (c) 2009, 2013 Florent Rougon
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; see the file COPYING. If not, write to the
# Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
# Boston, MA  02110-1301 USA.

# TODO: better building of absolute URLs from the values of href attributes:
# one should at least recognize absolute URLs and use the <base> tag, if any,
# to interpret relative URLs. One should also recognize scheme-less URIs such
# as //example.com/foo/bar (AKA protocol-relative URIs) [cf. RFC 3986]; a
# sketch of this is given after the imports below.

import sys, os, argparse, time, stat, locale, re, subprocess, traceback, logging
import random, abc, datetime
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

try:
    import requests
except ImportError:
    HAS_REQUESTS = False
else:
    HAS_REQUESTS = True

import textwrap
from textwrap import dedent
tw = textwrap.TextWrapper(width=80, break_long_words=False,
                          break_on_hyphens=False)
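

# A minimal sketch of the href resolution described in the TODO above
# (hypothetical helper, not called anywhere yet): urllib.parse.urljoin()
# already handles absolute and protocol-relative URLs per RFC 3986, so the
# only extra work is resolving against the <base> tag when one is present.
def resolveHref(pageUrl, soup, href):
    """Resolve 'href' found on the page at 'pageUrl', honoring <base>."""
    baseTag = soup.find("base", href=True)
    if baseTag is not None:
        # A <base> tag may itself be relative; resolve it against the page URL
        pageUrl = urllib.parse.urljoin(pageUrl, baseTag["href"])
    return urllib.parse.urljoin(pageUrl, href)
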

defaultStartUrl = "http://www.flickr.com/photos/bibliodyssey/sets/"
defaultNbRetries = 12
defaultRetryDelay = 15     # in seconds
defaultDelayBeforeDownload = 2  # in seconds
# defaultOutputDir = os.path.expanduser("~/peacay")

# Will be set when the command line is parsed
params = None

progname = os.path.basename(sys.argv[0])
progversion = "0.7.1"
version_blurb = """Written by Florent Rougon.

Copyright (c) 2009-2013  Florent Rougon
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."""

imagePagePath_cre = re.compile(
    r'(?P<dir>/.+)/(?P<img_id>[0-9]+)/in/set-(?P<set_id>[0-9]+)/?$')
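# Example of a matching path: /photos/bibliodyssey/1234567890/in/set-987654/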
# Extensions to try when checking if we already have an image file, for the
# cases where we don't know its extension.
searchedExtensions = ("png", "gif", "jpg", "jpeg", "tiff", "svg", "pdf")
searchedExtensionsAllCases = list(searchedExtensions)
searchedExtensionsAllCases.extend([ext.upper() for ext in
                                   searchedExtensionsAllCases])
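# i.e., the lowercase extensions followed by their uppercase variants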

# Will be set in main(); not used anymore (we use str objects and let Python
# encode and decode with its default parameters)
preferredEncoding = None


def setup_logging(level=logging.NOTSET, chLevel=None):
    global logger

    if chLevel is None:
        chLevel = level

    logger = logging.getLogger(progname)
    logger.setLevel(level)
    # Create console handler and set its level
    ch = logging.StreamHandler() # Uses sys.stderr by default
    ch.setLevel(chLevel)  # NOTSET, DEBUG, INFO, WARNING, ERROR, CRITICAL
    # Logger name with :%(name)s... many other things available
    # formatter = logging.Formatter("[%(threadName)s] %(message)s")
    formatter = logging.Formatter("%(levelname)s %(message)s")
    # Add formatter to ch
    ch.setFormatter(formatter)
    # Add ch to logger
    logger.addHandler(ch)

setup_logging()
# Effective level for all child loggers with NOTSET level
logger.setLevel(logging.INFO)


# Exceptions raised by this module
class error(Exception):
    """Base class for exceptions in flo-download-flickr-collection."""
    def __init__(self, message=None):
        self.message = message

    def __str__(self):
        return self.complete_message()

    def __repr__(self):
        return "{}({!r})".format(self.__class__.__name__, self.message)

    def complete_message(self):
        if self.message:
            return "{0}: {1}".format(self.ExceptionShortDescription,
                                     self.message)
        else:
            return self.ExceptionShortDescription

    ExceptionShortDescription = "{0} generic exception".format(progname)


class ExternalProgramCannotBeExecuted(error):
    """Exception raised when an external program cannot be executed."""
    ExceptionShortDescription = "unable to execute this program"

class ExternalProgramReturnedAbnormalExitCode(error):
    """Exception raised when an external program returned an abnormal exit code."""
    def __init__(self, program, retcode):
        self.program = program
        self.retcode = retcode
        self.message = "%s, %d" % (self.program, self.retcode)

    ExceptionShortDescription = "abnormal exit code from external program"

class ExternalProgramTerminatedBySignal(error):
    """Exception raised when an external program is terminated by a signal."""
    def __init__(self, program, signum):
        self.program = program
        self.signum = signum
        self.message = "%s, %d" % (self.program, self.signum)

    ExceptionShortDescription = "program terminated by a signal"

class DirectoryAlreadyExistsAsFile(error):
    """Exception raised when failing to create a directory because the path \
already exists and is not a directory."""
    ExceptionShortDescription = "path already exists and is not a directory"

class UrlOpenError(error):
    """Exception raised when failing to open a URL."""
    ExceptionShortDescription = "unable to open URL"

class ExitFromCommandLineOrConfigFileParsing(error):
    """Exception raised to exit from command line and config file parsing"""
    def __init__(self, message=None, file=sys.stderr, exit_status=2):
        super().__init__(message)
        self.file = file
        self.exit_status = exit_status

    def __repr__(self):
        return """{classname}(
  message={msg!r},
  file={file!r},
  exit_status={status!r})""".format(classname=self.__class__.__name__,
                                    file=self.file,
                                    msg=self.message,
                                    status=self.exit_status)


class UrlOpener(metaclass=abc.ABCMeta):
    def __init__(self, retries=defaultNbRetries, retryDelay=defaultRetryDelay):
        self.retries = retries
        self.retryDelay = retryDelay

    @abc.abstractmethod
    def rawOpenUrl(self, url, **kwargs):
        """Low-level opening of 'url', without any code to retry after an error.

        Implementations must raise UrlOpenError when encountering an error
        condition that should lead to a retry (i.e., a non-fatal error).

        The method must return an object that is either a 'bytes' or
        'bytearray' instance, or a file-like object in binary mode (the two
        main users of this class are BeautifulSoup and
        UrlOpenerBasedDownloader; the former would happily accept 'str'
        instances as well, but the latter absolutely needs binary contents,
        otherwise it would be impossible to download anything other than text
        files).

        """
        raise NotImplementedError()

    def openUrl(self, url, **kwargs):
        """High-level opening of 'url', using self.retries and self.retryDelay.

        self.rawOpenUrl() is used to perform the actual opening of 'url'.

        """
        for i in range(1 + self.retries):
            if i != 0:
                # Exponential backoff
                sleepTime = self.retryDelay*2**(i-1)
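                # e.g., with the default retryDelay of 15 s: 15, 30, 60, 120...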

                if sleepTime < 60:
                    delayStr = "{} seconds".format(sleepTime)
                else:
                    delayStr = "{} seconds (i.e., {})".format(
                        sleepTime, datetime.timedelta(seconds=sleepTime))

                cause = getattr(lastException, "__cause__", None)
                complement = " ({})".format(cause) if cause is not None else ""

                logger.warning(
                    "Error while trying to open URL {url}{complement}, retrying "
                    "in {delay}".format(url=url, complement=complement,
                                        delay=delayStr))
                time.sleep(sleepTime)
            try:
                fileOrBytes = self.rawOpenUrl(url, **kwargs)
            except UrlOpenError as e:
                lastException = e
                lastError = traceback.format_exc()
                continue
            else:
                return fileOrBytes

        sys.exit("{}\n{}: aborting after {} retries for URL {}.".format(
                lastError, progname, self.retries, url))


class UrllibBasedUrlOpener(UrlOpener):
    def rawOpenUrl(self, url, **kwargs):
        try:
            f = urllib.request.urlopen(url, **kwargs)
        except urllib.error.HTTPError as e:
            if e.code == 404:
                sys.exit("Fatal: got an HTTP 404 error trying to download {}"
                         .format(url))
            else:
                # Allow retrying of the failed download
                raise UrlOpenError(url) from e
        # This is the generic exception for urllib
        except urllib.error.URLError as e:
            # # Gateway Time-out, was for urllib.error.HTTPError
            # if e.code == 504:
            raise UrlOpenError(url) from e
        else:
            return f


class RequestsBasedUrlOpener(UrlOpener):
    def rawOpenUrl(self, url, **kwargs):
        try:
            r = requests.get(url, timeout=60, **kwargs)
            if r.status_code == 404:
                sys.exit("Fatal: got an HTTP 404 error trying to download {}"
                         .format(url))
            # Only raises an exception if r.status_code indicates some kind of
            # error condition (e.g., a 200 response does not cause any
            # exception to be raised).
            r.raise_for_status()
        # This is the generic exception for requests
        except requests.exceptions.RequestException as e:
            raise UrlOpenError(url) from e
        else:
            # return io.StringIO(r.content)
            #   → initial_value must be str or None, not bytes
            return r.content


urlOpenerFactories = {"urllib": UrllibBasedUrlOpener,
                      "requests": RequestsBasedUrlOpener}


class Downloader(metaclass=abc.ABCMeta):
    def __init__(self, delayBeforeDownload=defaultDelayBeforeDownload):
        self.delayBeforeDownload = delayBeforeDownload

    def wait(self):
        """Wait a bit in order to fool statiscal analysers."""
        time.sleep(random.uniform(0.75, 1.25)*self.delayBeforeDownload)

    @abc.abstractmethod
    def download(self, url, output, **kwargs):
        return

    def waitAndDownload(self, *args, **kwargs):
        self.wait()
        return self.download(*args, **kwargs)


class UrlOpenerBasedDownloader(Downloader):
    def __init__(self, urlOpener, bufSize=2**16,
                 delayBeforeDownload=defaultDelayBeforeDownload):
        super().__init__(delayBeforeDownload)
        self.urlOpener = urlOpener
        self.bufSize = bufSize
        self.downloadedBytes = 0
        self.downloadedFiles = 0

    def fileBasedDownload(self, f, output):
        """Download using a binary file-like object 'f'.

        urllib.request.urlopen() returns such objects, but any object
        providing the same interface as a file opened in binary mode is
        acceptable as 'f'.

        """
        with open(output, "wb") as of:
            # Number of bytes downloaded for this file
            downloaded_file = 0

            while True:
                chunk = f.read(self.bufSize)
                if not chunk:
                    break

                # Use the actual chunk size: the last chunk read is usually
                # shorter than self.bufSize
                downloaded_file += len(chunk)
                self.downloadedBytes += len(chunk)

                # Impossible with logging
                # sys.stderr.write(
                #     "\r{prefix}Downloaded {bytes} bytes (total: {MiB:.1f} MiB)"
                #     .format(prefix=msgPrefix, bytes=downloaded_file,
                #             MiB=float(downloaded_total/2**20)))
                # sys.stderr.flush()

                of.write(chunk)

    def download(self, url, output):
        f = self.urlOpener.openUrl(url)

        if isinstance(f, (bytes, bytearray)):
            with open(output, "wb") as of:
                of.write(f)
            self.downloadedBytes += len(f)
        else:
            self.fileBasedDownload(f, output)

        self.downloadedFiles += 1


class ExternalDownloader(Downloader):
    def __init__(self, program, retries=defaultNbRetries,
                 retryDelay=defaultRetryDelay,
                 delayBeforeDownload=defaultDelayBeforeDownload):
        super().__init__(delayBeforeDownload)
        self.program = program
        self.retries = retries
        self.retryDelay = retryDelay
        self.downloadedBytes = 0
        self.downloadedFiles = 0

    @abc.abstractmethod
    def buildArgs(self, url, output):
        raise NotImplementedError()

    def download(self, url, output):
        args = self.buildArgs(url, output)

        try:
            p = subprocess.Popen(args, shell=False, stdout=None,
                                 stderr=None, close_fds=True)
            retcode = p.wait()
        except OSError as e:
            raise ExternalProgramCannotBeExecuted("%s: %s" % (self.program,
                                                              e.strerror))
        if retcode > 0:
            # raise ExternalProgramReturnedAbnormalExitCode(self.program,
            #                                               retcode)
            logger.warning("\nNon-zero return code from {program} ({code})\n"
                           .format(program=self.program, code=retcode))
        elif retcode < 0:
            raise ExternalProgramTerminatedBySignal(self.program, -retcode)
        else:
            self.downloadedBytes += os.stat(output).st_size
            self.downloadedFiles += 1


# wget is an interesting choice because it times out and retries when the
# connection stalls, resuming the download where it stopped. In the same
# situation, urllib2 seems to get stuck forever.
class WGetDownloader(ExternalDownloader):
    def __init__(self, program="wget", **kwargs):
        super().__init__(program, **kwargs)

    def buildArgs(self, url, output):
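        # -c resumes a partially-downloaded file; -O sets the output file name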
        return [ self.program, "--tries={}".format(1+self.retries),
                 "--timeout=60",
                 "--wait={}".format(self.retryDelay),
                 "--random-wait", "-c", "-O", output, url ]


class CurlDownloader(ExternalDownloader):
    def __init__(self, program="curl", **kwargs):
        super().__init__(program, **kwargs)

    def buildArgs(self, url, output):
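        # Note: curl's --retry does its own backoff; self.retryDelay is unused
        # here (cf. the --retry-delay help text: some backends ignore it)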
        return [ self.program, "--retry", str(self.retries),
                 "-o", output, url ]


def nextTagFilter(tag):
    """Used for the Sets Overview pages as well as for individual Sets pages"""
    return tag.name == "a" and \
        "href" in tag.attrs and \
        ("data-track", "next") in tag.attrs.items()


class SetMetaData:
    def __init__(self, rootUrl=None, num=None, name=None, id=None, URL=None):
        values = locals()
        for param in ("rootUrl", "num", "name", "id", "URL"):
            setattr(self, param, values[param])

class SetsOverviewPageParser:
    def __init__(self, urlOpener, parser=None):
        self.urlOpener = urlOpener
        self.parser = parser

    def parse(self, input_, rootUrl):
        pageNum = 0
        setNum = 0
        sets = []

        while True:
            pageNum += 1
            logger.info("Sets Overview page {0}".format(pageNum))
            soup = BeautifulSoup(input_, self.parser)
            # print("\n", soup.prettify(), file=sys.stderr, end="\n\n")
            tags = soup.find_all("a", class_="Seta", href=True, title=True)

            if not tags:
                logger.warning("Could not find any set on page number "
                               "{} of the Sets Overview.".format(pageNum))
                break

            for tag in tags:
                setNum += 1
                setName = tag["title"]
                logger.info("Found set number {}: '{}'...".format(
                        setNum, setName))

                pageUrl = urllib.parse.urljoin(rootUrl, tag["href"])
                sets.append(SetMetaData(rootUrl=rootUrl,
                                        num=setNum,
                                        name=setName,
                                        URL=pageUrl))

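            # 'tag' is the last set link found above; look for the
            # "next page" link after it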
            nextTag = tag.find_next(nextTagFilter)
            if nextTag is not None:
                pageUrl = urllib.parse.urljoin(rootUrl, nextTag["href"])
                input_ = self.urlOpener.openUrl(pageUrl)
            else:
                break

        return sets


def thumbTagFilter(tag):
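    """Match thumbnail links of the form
    <a href="..." title="..." data-track="photo-click"> containing at least
    one <img> tag."""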
    return tag.name == "a" and \
        "href" in tag.attrs and \
        "title" in tag.attrs and \
        ("data-track", "photo-click") in tag.attrs.items() and \
        tag.find_all("img")


class ImageMetaData:
    def __init__(self, setMD=None, numInSet=None, id=None, title=None,
                 pageUrl=None, imageUrl=None):
        values = locals()
        for param in ("setMD", "numInSet", "id", "title", "pageUrl",
                      "imageUrl"):
            setattr(self, param, values[param])

class SetPageParser:
    def __init__(self, urlOpener, parser=None):
        self.urlOpener = urlOpener
        self.parser = parser

    def parse(self, setMD, input_=None):
        imagePages = []
        imageNum = 0
        pageNum = 0
        if input_ is None:
            input_ = self.urlOpener.openUrl(setMD.URL)

        while True:
            pageNum += 1
            soup = BeautifulSoup(input_, self.parser)
            tags = soup.find_all(thumbTagFilter)

            # for dtag in soup.find_all("div", class_="photo-display-item"):
            #     # Stop at the first matching tag
            #     tag = dtag.find(thumbTagFilter)
            if not tags:
                logger.warning("Could not find any image on page number "
                               "{} of Set {} ({}).".format(pageNum, setMD.num,
                                                            setMD.name))
                break

            for tag in tags:
                path = tag["href"]
                imageTitle = tag["title"]

                mo = imagePagePath_cre.match(path)
                if not mo:
                    sys.exit("%s: unable to parse image path '%s'"
                             % (progname, path))

                prefix = " " * 0
                imageNum += 1
                msg = "{0}Found image nb {1} ({2})...".format(
                    prefix, imageNum, imageTitle)
                logger.debug(msg)

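                # Build the "original size" page path, e.g.
                # "/photos/<user>/<img_id>/sizes/o/in/set-<set_id>/"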
                imagePagePath_origSize = "%s/%s/sizes/o/in/set-%s/" \
                                         % mo.group("dir", "img_id", "set_id")
                imagePageUrl = urllib.parse.urljoin(setMD.rootUrl,
                                                    imagePagePath_origSize)

                imagePages.append(ImageMetaData(setMD=setMD,
                                                numInSet=imageNum,
                                                id=mo.group("img_id"),
                                                title=imageTitle,
                                                pageUrl=imagePageUrl))

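            # 'tag' is the last thumbnail found above; look for the
            # "next page" link after it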
            nextTag = tag.find_next(nextTagFilter)
            if nextTag is not None:
                pageUrl = urllib.parse.urljoin(setMD.rootUrl, nextTag["href"])
                input_ = self.urlOpener.openUrl(pageUrl)
            else:
                break

        return imagePages


class ImagePageParser:
    def __init__(self, urlOpener=None, parser=None):
        self.urlOpener = urlOpener
        self.parser = parser

    def parse(self, imageMD, input_=None):
        assert not (input_ is None and self.urlOpener is None)
        if input_ is None:
            input_ = self.urlOpener.openUrl(imageMD.pageUrl)

        soup = BeautifulSoup(input_, self.parser)

        # with open(os.path.expanduser(
        #         "~/tmp/peacay.htmlcode/image_page_prettified.html"), "w") as f:
        #     print(soup.prettify(), file=f)

        tag = soup.find("div", id="allsizes-photo")
        imgTag = tag.find("img")
        assert imgTag is not None, \
            'Expected an <img> tag inside <div id="allsizes-photo">'

        imageMD.imageUrl = imgTag["src"]


def make_sure_directory_exists(directory):
    if not os.path.isdir(directory):
        if os.path.exists(directory):
            raise DirectoryAlreadyExistsAsFile(directory)
        else:
            os.makedirs(directory)


def print_stats(downloader, exploredSets, sets):
    print("Download completed ({nbFiles} image(s) in {exploredSets} set(s) "
          "[among a total of {totalSets}], {bytesTotal:.1f} MiB).".format(
          nbFiles=downloader.downloadedFiles, exploredSets=exploredSets,
          totalSets=len(sets),
          bytesTotal=float(downloader.downloadedBytes/2**20)), file=sys.stderr)
    print("Sending report to <sarko@hadopi.fr>...", end=" ", flush=True,
          file=sys.stderr)
    time.sleep(1.0)
    print("done.", file=sys.stderr)


def imagePaths(imageMD, noext=False):
    assert noext or imageMD.imageUrl is not None, (noext, imageMD.imageUrl)

    oDir = os.path.join(params.output_dir, imageMD.setMD.name)
    txtFile = os.path.join(oDir, imageMD.id + ".txt")

    if imageMD.imageUrl is not None:
        u = urllib.parse.urlsplit(imageMD.imageUrl)
        root, ext = os.path.splitext(u.path)
        imgBaseName = imageMD.id + ext
    else:
        imgBaseName = imageMD.id

    imgFile = os.path.join(oDir, imgBaseName)
    return {"dir": oDir,
            "description": txtFile,
            "image": imgFile,
            "basename": imgBaseName}


def downloadImage(imageMD, downloader, number=None, total=None):
    imgPaths = imagePaths(imageMD)
    imgFile = imgPaths["image"]
    imgBaseName = imgPaths["basename"]

    if os.path.exists(imgFile):
        if number is not None and total is not None:
            complement = " ({}/{})".format(number, total)
        else:
            complement = ""
        logger.info("{}{} is already present, skipping.".format(imgBaseName,
                                                                complement))
    else:
        # Download the image
        make_sure_directory_exists(imgPaths["dir"])

        if number is not None and total is not None:
            complement = "{}/{} ".format(number, total)
        else:
            complement = ""
        logger.info("Downloading {img} {cmpl}{title}...".format(
                img=imgBaseName, cmpl=complement, title=imageMD.title))

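        # Download to a '.part' file first so that an interrupted transfer
        # can be detected and removed later by remove_partial_downloads()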
        tmpFile = "%s.part" % imgFile
        downloader.waitAndDownload(imageMD.imageUrl, tmpFile)
        os.rename(tmpFile, imgFile)

    # Write a text file containing (hopefully) a description of the
    # image
    text = imageMD.title + '\n'
    with open(imgPaths["description"], "w") as f:
        f.write(text)


def parse_and_download():
    urlOpener = urlOpenerFactories[params.urlopen_method](
        retries=params.retries, retryDelay=params.retry_delay)

    if params.download_method == "wget":
        downloader = WGetDownloader(
            retries=params.retries, retryDelay=params.retry_delay,
            delayBeforeDownload=params.delay_before_download)
    elif params.download_method == "curl":
        downloader = CurlDownloader(
            retries=params.retries,
            delayBeforeDownload=params.delay_before_download)
    else:
        downloader = UrlOpenerBasedDownloader(
            urlOpener,
            delayBeforeDownload=params.delay_before_download)

    setsOverviewPageParser = SetsOverviewPageParser(urlOpener, params.parser)
    setPageParser = SetPageParser(urlOpener, params.parser)
    imagePageParser = ImagePageParser(urlOpener, params.parser)

    o = urllib.parse.urlparse(params.start_url)
    root_url = "{}://{}/".format(o.scheme, o.netloc)
    sets = setsOverviewPageParser.parse(urlOpener.openUrl(params.start_url),
                                        root_url)
    try:
        for i, setMD in enumerate(sets):
            logger.info(
                "  --> Entering set {setNum}/{nbSets} ({setName})."
                .format(setNum=setMD.num, nbSets=len(sets), setName=setMD.name))
            imagePages = setPageParser.parse(setMD)
            logger.info("      There are {} images in this set."
                        .format(len(imagePages)))
            for j, imageMD in enumerate(imagePages):
                if params.quick_check:
                    imgPaths = imagePaths(imageMD, noext=True)
                    found = False

                    for ext in searchedExtensionsAllCases:
                        path = "{}.{}".format(imgPaths["image"], ext)
                        basename = "{}.{}".format(imgPaths["basename"], ext)
                        if os.path.exists(path):
                            logger.info(
                                "{} ({}/{}) is already present, skipping."
                                .format(basename, j+1, len(imagePages)))
                            found = True
                            break

                    if found:
                        continue

                imagePageParser.parse(imageMD)
                # Now, imageMD.imageUrl contains a URL to the original-sized
                # image
                downloadImage(imageMD, downloader, number=j+1,
                              total=len(imagePages))
    except KeyboardInterrupt:
        print("\n", file=sys.stderr)

    if sets:
        print_stats(downloader, i+1, sets)


def os_walk_dumb_error_handling(exception):
    raise exception


def remove_partial_downloads(outputDir):
    if not os.path.exists(outputDir):
        return

    mode = os.stat(outputDir).st_mode
    if stat.S_ISDIR(mode):
        for root, dirs, files in os.walk(outputDir,
                                         onerror=os_walk_dumb_error_handling):
            for f in files:
                if f.endswith(".part"):
                    path = os.path.join(root, f)
                    logger.info(
                        "Removing leftover partial download '%s'..." % path)
                    os.unlink(path)



def process_command_line():
    params = argparse.Namespace()

    parser = argparse.ArgumentParser(
        usage="""\
%(prog)s [OPTION ...] OUTPUTDIR
Automatically download image files from a flickr-like web site.""",
        description="""\
Images and the associated descriptions are fetched from the image collection
located at STARTURL and stored into subdirectories of OUTPUTDIR.

The web page referenced by STARTURL must be a Sets Overview page
listing the various sets in a collection, positioned at the first page of
the Sets Overview.""",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # I want --help but not -h (it might be useful for something else)
        add_help=False)

    parser.add_argument('output_dir', metavar="OUTPUTDIR",
                        help="""\
      base directory for storing the images and their descriptions""")
    parser.add_argument('-s', '--start-url', metavar="STARTURL",
                        default=defaultStartUrl,
                        help="""\
      base URL of the image collection to download from
      (default: %(default)s)""")
    parser.add_argument('--quick-check', choices=("True", "False"),
                        default="True",
                        help="""\
      when checking if we already have an image, try a well-known set of
      extensions {searchedExtensions} with their upper case variants instead
      of fetching the image page, which would be the only way to know the
      extension of the original image, but takes more time (default:
      %(default)s)"""
                        .format(searchedExtensions=tuple(searchedExtensions)))
    parser.add_argument('-p', '--parser',
                        help="""\
      method to use for parsing HTML pages; this must be a valid value
      for the second argument of bs4.BeautifulSoup(). Possible values include
      'html.parser', 'lxml', 'html5lib', but some of these will only work if
      the corresponding libraries are installed (default: let BeautifulSoup
      decide which method to use)""")
    parser.add_argument('-w', '--delay-before-download', type=float,
                        default=defaultDelayBeforeDownload,
                        help="""\
      time to wait before downloading an image [will be multiplied by a random
      number in order to fool statistical analysis by nasty web
      servers] (default: %(default)s)""")
    parser.add_argument('--retries', type=int, default=defaultNbRetries,
                        help="""\
      number of times to retry a failed download (default: %(default)s)""")
    parser.add_argument('--retry-delay', type=float, default=defaultRetryDelay,
                        help="""\
      base delay in seconds before retrying a failed download [actually, most
      backends use an exponential backoff algorithm in order to increase the
      chances of recovering from the error; some backends ignore this value]
      (default: %(default)s)""")
    parser.add_argument('-D', '--download-method', choices=("urlopen", "wget",
                                                            "curl"),
                        default="urlopen", help="""\
      method to use to download image files (default: %(default)s)""")
    parser.add_argument('-U', '--urlopen-method', choices=("urllib", "requests"),
                        default=None, help="""\
      method to use to open URLs (default: 'requests' if available, otherwise
      'urllib')""")
    parser.add_argument('--help', action="help",
                        help="display this message and exit")
    # The version text is not wrapped when using
    # formatter_class=argparse.RawDescriptionHelpFormatter
    parser.add_argument('--version', action='version',
                        version="{name} {version}\n{blurb}".format(
            name=progname, version=progversion, blurb=version_blurb))

    params = parser.parse_args(namespace=params)
    # argparse leaves the choice as a string; convert it to a real boolean
    # (note that bool("False") would be True, hence no type=bool above)
    params.quick_check = (params.quick_check == "True")

    if params.urlopen_method is None:
        params.urlopen_method = "requests" if HAS_REQUESTS else "urllib"
    elif params.urlopen_method == "requests" and not HAS_REQUESTS:
        msg = dedent("""\
          cannot use 'requests' as the urlopen method: 'requests' module is not
          available""")
        raise ExitFromCommandLineOrConfigFileParsing(msg)

    return params


def main():
    global params, preferredEncoding

    locale.setlocale(locale.LC_ALL, '')
    preferredEncoding = locale.getpreferredencoding()

    try:
        params = process_command_line()
    except ExitFromCommandLineOrConfigFileParsing as e:
        if e.message is not None:
            print(tw.fill("{}: {}".format(progname, e.message)), file=e.file)
        sys.exit(e.exit_status)

    try:
        remove_partial_downloads(params.output_dir)
        parse_and_download()

        sys.exit(0)
    except error as e:
        sys.stderr.write("Error: {}\n".format(e))
        sys.exit(1)


if __name__ == "__main__": main()
