# -*- coding: utf-8 -*-
import logging
from io import BytesIO
from os import path
from time import sleep
from zipfile import ZIP_DEFLATED, ZipFile
import requests
# Module-level logger registered under the name 'pyscrawl'.
logger = logging.getLogger('pyscrawl')
class ScrawlError(Exception):
    """Error raised when a call to the Scrawl API fails.

    :param str message: Human readable description of the failure.
    :param result: Extra payload describing the failure; stored unchanged
        on the instance as :py:data:`result`.
    """

    def __init__(self, message, result):
        # Only the message becomes part of ``args``; the payload is kept
        # as a separate attribute.
        Exception.__init__(self, message)
        self.result = result
class Scrawl(object):
    """Scrawl API service.

    This allows you to call our API service with ease.

    :param str api_key: Your Scrawl API key.
    :param str server: The Scrawl server location. A trailing slash is
        tolerated when request URLs are built.

    .. code-block:: python

        from pyscrawl import Scrawl
        scrawl = Scrawl('my-api-key')
    """

    def __init__(self, api_key, server='https://scrawl.nl'):
        self.api_key = api_key
        self.server = server

    def _endpoint(self, route):
        """Return the absolute URL for ``route`` on the configured server.

        :param str route: Path starting with a slash, e.g. ``/api/upload``.
        :rtype: str
        """
        # Strip trailing slashes so 'https://host/' does not produce
        # 'https://host//api/upload'.
        return self.server.rstrip('/') + route

    def upload_zipfile(self, title, filename, override=False):
        """Upload a new document to scrawl.

        :param str title: The title of the document.
        :param str filename: Name of the zip-file to upload.
        :param bool override: Override if document with the same name exists.
            Defaults to `False`.
        :raises ValueError: When ``filename`` does not exist.
        :rtype: ScrawlResultDocument

        .. code-block:: python

            result = scrawl.upload_zipfile('PyScrawl test', './document.zip')
            result.sleep_until_ready()
        """
        if not path.exists(filename):
            raise ValueError('File does not exist.')

        # Binary mode: the zip archive is uploaded as-is.
        with open(filename, 'rb') as stream:
            return self.upload_stream(title, stream, override)

    def upload_container(self, title, container, override=False):
        """Upload a new document to scrawl using ScrawlContainer.

        You can use this to construct a zip-file on the fly.

        :param str title: The title of the document.
        :param ScrawlContainer container: The container to upload.
        :param bool override: Override if document with the same name exists.
            Defaults to `False`.
        :raises ValueError: When ``container`` is not a ScrawlContainer.
        :rtype: ScrawlResultDocument
        """
        if not isinstance(container, ScrawlContainer):
            raise ValueError('Use ScrawlContainer with `upload_container`.')

        # Entering the container yields the stream that is uploaded.
        with container as stream:
            return self.upload_stream(title, stream, override)

    def upload_stream(self, title, stream, override=False):
        """Upload a new document using a BytesIO stream.

        We recommend you use :func:`upload_zipfile` or
        :func:`upload_container`, but you can use this when you want to do
        something more advanced.

        :param str title: The title of the document.
        :param _IOBase stream: The stream to upload.
        :param bool override: Override if document with the same name exists.
            Defaults to `False`.
        :raises ScrawlError: When the server does not answer with HTTP 200;
            the response text is available as ``result`` on the error.
        :rtype: ScrawlResultDocument
        """
        res = requests.post(self._endpoint('/api/upload'),
                            data={'title': title, 'override': override},
                            files={'upload': ('document.zip', stream)},
                            headers={'api-key': self.api_key})

        if res.status_code != 200:
            raise ScrawlError('Failed to upload stream.',
                              result=res.text)

        return ScrawlResultDocument(res.json(), self.api_key)
class ScrawlContainer(object):
    """Scrawl Container (in-memory ZipFile).

    This allows you to programmatically build a ZipFile in memory. Use
    :func:`add_file` and :func:`add_content` to build the ZipFile. When
    you are done, you can upload it by calling :meth:`Scrawl.upload_container`.

    The container is also a context manager: entering it builds the
    in-memory archive and yields the stream, leaving it closes the stream.

    .. code-block:: python

        from pyscrawl import Scrawl, ScrawlContainer
        scrawl = Scrawl('my-api-key')

        # define an index_html using a template engine (for example)
        index_html = '''<html>...</html>'''

        # build a new document
        container = (ScrawlContainer()
                     .add_file('./styles.css')
                     .add_file('./awesome-logo.png')
                     .add_content('index.html', index_html))

        # upload to Scrawl and wait for the conversion.
        result = scrawl.upload_container('PyScrawl test', container)
        result.sleep_until_ready()
    """

    def __init__(self):
        # Paths of on-disk files to copy into the archive.
        self.filenames = list()
        # Mapping of archive member name -> content to write.
        self.files = dict()
        # Stream of the most recent ``with`` block; None until first entered.
        self.stream = None

    def add_file(self, filename):
        """Add a file from disk to the container.

        :param str filename: The filename on the disk.
        :raises ValueError: When ``filename`` does not exist.
        :rtype: ScrawlContainer
        """
        if not path.exists(filename):
            raise ValueError('File does not exist.')
        self.filenames.append(filename)
        return self

    def add_content(self, filename, content):
        """Add content to the container.

        Adding content under a name that was already used replaces the
        earlier content.

        :param str filename: How to represent the content.
        :param str content: The actual content
        :rtype: ScrawlContainer
        """
        self.files[filename] = content
        return self

    def __enter__(self):
        self.stream = self.get_stream()
        return self.stream

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.stream is not None:
            self.stream.close()

    def get_stream(self):
        """Build a BytesIO in-memory ZipFile based on the files.

        Use :func:`add_file` and :func:`add_content` to prepare the container.
        When done, you can call :func:`get_stream` to build an in-memory
        ZipFile based on the current configuration.

        When using :meth:`Scrawl.upload_container`, this method is
        automatically called.

        :rtype: BytesIO
        """
        stream = BytesIO()
        # The with-block finalises the archive (writes the central
        # directory) and also closes it when a write fails.
        with ZipFile(stream, 'w', ZIP_DEFLATED, allowZip64=False) as archive:
            for filename in self.filenames:
                # Files are stored flat, under their base name only.
                archive.write(filename, path.basename(filename))
            for member, content in self.files.items():
                archive.writestr(member, content)
        # Rewind so the caller reads the archive from the beginning.
        stream.seek(0)
        return stream
class ScrawlResultDocument(object):
    """Scrawl Result Document.

    When calling the API, this result document is returned. It contains
    information about your uploaded document.

    See :meth:`Scrawl.upload_zipfile` or :meth:`Scrawl.upload_container` for
    more information.

    :param dict result: Decoded JSON answer of the API; must contain the
        keys ``info``, ``pdf``, ``html``, ``converted``, ``placed_on`` and
        ``processed_on``.
    :param str api_key: The API key used for follow-up requests.

    :ivar str api_key: The API key.
    :ivar str info: Information URL.
    :ivar str pdf: URL, for downloading or viewing the PDF.
    :ivar str html: URL, for online viewing the uploaded HTML document.
    :ivar bool converted: `True` when Scrawl has converted the document to PDF.
    :ivar placed_on: When the document was added to the Scrawler conversion
        queue; stored exactly as returned by the API (not parsed).
    :ivar processed_on: When the document was processed by the Scrawler;
        stored exactly as returned by the API (not parsed).
    """

    def __init__(self, result, api_key):
        self.api_key = api_key
        # Set first: every later update is validated against this link.
        self.info = result['info']
        self.pdf = None
        self.html = None
        self.converted = False
        self.placed_on = None
        self.processed_on = None
        self.__update_with(result)

    def sleep_until_ready(self, delay=1):
        """Sleep (in delays) until Scrawl has converted the document.

        Until the :py:data:`converted` bit is set to `True`. You can always
        store the :py:data:`pdf` link, even if the conversion has not yet
        been completed.

        :param int delay: The amount of seconds to sleep (at least 1 second).
        :raises ValueError: When ``delay`` is lower than 1.
        :raises ScrawlError: When polling the information URL does not
            answer with HTTP 200.
        :rtype: bool
        """
        if delay < 1:
            raise ValueError('Delay should not be lower than 1 second.')

        # Poll the information URL until the API reports the conversion.
        while self.converted is False:
            sleep(delay)
            self.__update()
        return True

    def __update(self):
        """Fetch the information URL and refresh this document."""
        res = requests.get(self.info,
                           headers={'api-key': self.api_key})
        # Same error handling as the upload call: anything but HTTP 200 is
        # reported as a ScrawlError instead of a JSON/KeyError failure.
        if res.status_code != 200:
            raise ScrawlError('Failed to fetch document info.',
                              result=res.text)
        self.__update_with(res.json())

    def __update_with(self, json_result):
        """Copy the fields of a decoded API answer onto this document.

        :raises ValueError: When the answer refers to another info link.
        """
        if self.info != json_result['info']:
            raise ValueError('API returned a different info link.')
        self.pdf = json_result['pdf']
        self.html = json_result['html']
        self.converted = json_result['converted']
        self.placed_on = json_result['placed_on']
        self.processed_on = json_result['processed_on']