# Source code for pyscrawl

# -*- coding: utf-8 -*-
import logging
from io import BytesIO
from os import path
from time import sleep
from zipfile import ZIP_DEFLATED, ZipFile

import requests

# Module-level logger for the package. It is registered under the name
# 'pyscrawl', so applications can configure it through the standard
# logging hierarchy.
logger = logging.getLogger('pyscrawl')


class ScrawlError(Exception):
    """Error raised when a Scrawl API call fails.

    :param str message: Human readable description of the failure.
    :param result: Extra payload describing the failure; it is kept on the
        instance as :py:data:`result`.
    """

    def __init__(self, message, result):
        # Keep the payload around so callers can inspect it after catching.
        self.result = result
        super(ScrawlError, self).__init__(message)


class Scrawl(object):
    """Scrawl API service.

    This allows you to call our API service with ease.

    :param str api_key: Your Scrawl API key.
    :param str server: The Scrawl server location.
    :param timeout: Optional timeout (in seconds) handed to ``requests``
        when uploading. Defaults to `None`, which means no timeout is
        applied (the behaviour before this parameter existed).

    .. code-block:: python

        from pyscrawl import Scrawl
        scrawl = Scrawl('my-api-key')

    """

    def __init__(self, api_key, server='https://scrawl.nl', timeout=None):
        self.api_key = api_key
        self.server = server
        self.timeout = timeout

    def upload_zipfile(self, title, filename, override=False):
        """Upload a new document to scrawl.

        :param str title: The title of the document.
        :param str filename: Name of the zip-file to upload.
        :param bool override: Override if document with the same name exists.
            Defaults to `False`.

        :raises ValueError: When `filename` does not exist.
        :raises ScrawlError: When the upload is rejected, see
            :func:`upload_stream`.
        :rtype: ScrawlResultDocument

        .. code-block:: python

            result = scrawl.upload_zipfile('PyScrawl test', './document.zip')
            result.sleep_until_ready()

        """
        if not path.exists(filename):
            raise ValueError('File does not exist.')

        # Binary mode: the zip-file is sent to the server untouched.
        with open(filename, 'rb') as stream:
            return self.upload_stream(title, stream, override)

    def upload_container(self, title, container, override=False):
        """Upload a new document to scrawl using ScrawlContainer.

        You can use this to construct a zip-file on the fly.

        :param str title: The title of the document.
        :param ScrawlContainer container: The container to upload.
        :param bool override: Override if document with the same name exists.
            Defaults to `False`.

        :raises ValueError: When `container` is not a ScrawlContainer.
        :raises ScrawlError: When the upload is rejected, see
            :func:`upload_stream`.
        :rtype: ScrawlResultDocument
        """
        if not isinstance(container, ScrawlContainer):
            raise ValueError('Use ScrawlContainer with `upload_container`.')

        # Entering the container builds the in-memory zip-file; leaving the
        # block closes it again.
        with container as stream:
            return self.upload_stream(title, stream, override)

    def upload_stream(self, title, stream, override=False):
        """Upload a new document using a BytesIO stream.

        We recommend you use :func:`upload_zipfile` or
        :func:`upload_container`, but you can use this when you want to do
        something more advanced.

        :param str title: The title of the document.
        :param _IOBase stream: The stream to upload.
        :param bool override: Override if document with the same name exists.
            Defaults to `False`.

        :raises ScrawlError: When the server does not answer with HTTP 200;
            the response text is available as ``result`` on the error.
        :rtype: ScrawlResultDocument
        """
        # Tolerate a server location that was configured with a trailing
        # slash, so the request never goes to '//api/upload'.
        url = self.server.rstrip('/') + '/api/upload'

        res = requests.post(url,
                            data={'title': title, 'override': override},
                            files={'upload': ('document.zip', stream)},
                            headers={'api-key': self.api_key},
                            timeout=self.timeout)

        if res.status_code != 200:
            raise ScrawlError('Failed to upload stream.',
                              result=res.text)

        return ScrawlResultDocument(res.json(), self.api_key)


class ScrawlContainer(object):
    """Scrawl Container (in-memory ZipFile).

    This allows you to programmatically build a ZipFile in memory. Use
    :func:`add_file` and :func:`add_content` to build the ZipFile. When
    you are done, you can upload it by calling :meth:`Scrawl.upload_container`.

    The container is also a context manager: entering it builds the
    in-memory ZipFile and returns the stream, leaving it closes the stream.

    .. code-block:: python

        from pyscrawl import Scrawl, ScrawlContainer
        scrawl = Scrawl('my-api-key')

        # define an index_html using a template engine (for example)
        index_html = '''<html>...</html>'''

        # build a new document
        container = (ScrawlContainer()
                     .add_file('./styles.css')
                     .add_file('./awesome-logo.png')
                     .add_content('index.html', index_html))

        # upload to Scrawl and wait for the conversion.
        result = scrawl.upload_container('PyScrawl test', container)
        result.sleep_until_ready()

    """

    def __init__(self):
        # Paths of files on disk, zipped under their base name.
        self.filenames = list()
        # Archive name -> content for entries that only exist in memory.
        self.files = dict()
        # Stream handed out by ``__enter__``; `None` until the container is
        # entered.
        self.stream = None

    def add_file(self, filename):
        """Add a file from disk to the container.

        The file is stored in the archive under its base name, without any
        directory part.

        :param str filename: The filename on the disk.
        :raises ValueError: When `filename` does not exist.
        :rtype: ScrawlContainer
        """
        if not path.exists(filename):
            raise ValueError('File does not exist.')

        self.filenames.append(filename)
        return self

    def add_content(self, filename, content):
        """Add content to the container.

        Adding content twice under the same `filename` replaces the earlier
        content.

        :param str filename: How to represent the content.
        :param str content: The actual content
        :rtype: ScrawlContainer
        """
        self.files[filename] = content
        return self

    def __enter__(self):
        self.stream = self.get_stream()
        return self.stream

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.stream is not None:
            self.stream.close()

    def get_stream(self):
        """Build a BytesIO in-memory ZipFile based on the files.

        Use :func:`add_file` and :func:`add_content` to prepare the container.
        When done, you can call :func:`get_stream` to build an in-memory
        ZipFile based on the current configuration.

        When using :meth:`Scrawl.upload_container`, this method is
        automatically called.

        :rtype: BytesIO
        """
        stream = BytesIO()

        # The with-block guarantees the archive is finalised (central
        # directory written) even when adding one of the entries fails.
        with ZipFile(stream, 'w', ZIP_DEFLATED, False) as zf:
            for filename in self.filenames:
                zf.write(filename, path.basename(filename))

            for filename, content in self.files.items():
                zf.writestr(filename, content)

        # Rewind so the caller reads the archive from its first byte.
        stream.seek(0)
        return stream


class ScrawlResultDocument(object):
    """Scrawl Result Document.

    When calling the API, this result document is returned. It contains
    information about your uploaded document.

    See :meth:`Scrawl.upload_zipfile` or :meth:`Scrawl.upload_container` for
    more information.

    :param dict result: The decoded JSON answer of the API. It must contain
        the keys ``info``, ``pdf``, ``html``, ``converted``, ``placed_on``
        and ``processed_on``.
    :param str api_key: The API key used for follow-up requests.

    :ivar str api_key: The API key.
    :ivar str info: Information URL.
    :ivar str pdf: URL, for downloading or viewing the PDF.
    :ivar str html: URL, for online viewing the uploaded HTML document.
    :ivar bool converted: `True` when Scrawl has converted the document to PDF.
    :ivar placed_on: When the document was added to the Scrawler conversion
        queue. Stored exactly as returned by the API (not parsed here).
    :ivar processed_on: When the document was processed by the Scrawler
        (`converted` is `True`, when this is set). Stored exactly as
        returned by the API (not parsed here).

    """

    def __init__(self, result, api_key):
        self.api_key = api_key
        self.info = result['info']
        self.pdf = None
        self.html = None
        self.converted = False
        self.placed_on = None
        self.processed_on = None

        self.__update_with(result)

    def sleep_until_ready(self, delay=1):
        """Sleep (in delays) until Scrawl has converted the document.

        Until the :py:data:`converted` bit is set to `True`. You can always
        store the :py:data:`pdf` link, even if the conversion has not yet
        been completed.

        :param int delay: The amount of seconds to sleep (at least 1 second).
        :raises ValueError: When `delay` is lower than 1.
        :raises ScrawlError: When polling the info URL does not answer with
            HTTP 200.
        :rtype: bool
        """
        if delay < 1:
            raise ValueError('Delay should not be lower than 1 second.')

        # Truthiness instead of ``is False``: a missing or null ``converted``
        # value must keep us waiting, not be mistaken for "ready".
        while not self.converted:
            sleep(delay)
            self.__update()

        return True

    def __update(self):
        """Fetch the info URL and refresh the attributes from its answer."""
        res = requests.get(self.info,
                           headers={'api-key': self.api_key})

        # Same failure handling as the upload call: anything but HTTP 200
        # is reported with the raw response text attached.
        if res.status_code != 200:
            raise ScrawlError('Failed to fetch document info.',
                              result=res.text)

        self.__update_with(res.json())

    def __update_with(self, json_result):
        """Copy the fields of a decoded API answer onto this document.

        :param dict json_result: The decoded JSON answer of the API.
        :raises ValueError: When the answer describes another info URL than
            the one this document was created with.
        """
        if self.info != json_result['info']:
            raise ValueError('API returned a different info link.')

        self.pdf = json_result['pdf']
        self.html = json_result['html']

        self.converted = json_result['converted']

        self.placed_on = json_result['placed_on']
        self.processed_on = json_result['processed_on']