Source code for weblyzard_api.client.jeremia

'''
.. codeauthor:: Albert Weichselbraun <albert.weichselbraun@htwchur.ch>
.. codeauthor:: Heinz-Peter Lang <lang@weblyzard.com>
'''
import unittest
import logging
from time import time
from sys import argv

from eWRT.ws.rest import MultiRESTClient
from weblyzard_api.xml_content import XMLContent
from weblyzard_api.client import WEBLYZARD_API_URL, WEBLYZARD_API_USER, WEBLYZARD_API_PASS

logger = logging.getLogger('weblyzard_api.client.jeremia')

[docs]class Jeremia(MultiRESTClient):
    '''
    **Jeremia Web Service**

    Pre-processes text documents and returns an annotated webLyzard XML document.

    **Blacklisting**

    Blacklisting is an optional service which removes sentences which occur
    multiple times in different documents from these documents. Examples for such
    sentences are document headers or footers.

    The following functions handle sentence blacklisting:

     * :func:`clear_blacklist`
     * :func:`get_blacklist`
     * :func:`submit_document_blacklist`
     * :func:`update_blacklist`

    Jeremia returns a 
    :doc:`webLyzard XML document <weblyzard_api.data_format.xml_format>`.
    The weblyzard_api provides the class :class:`.XMLContent` to process
    and manipulate the weblyzard XML documents.:

    .. note:: Example usage

        .. code-block:: python

            from weblyzard_api.client.recognize import Recognize
            from pprint import pprint

            docs = {'id': '192292', 
                    'title': 'The document title.', 
                    'body': 'This is the document text...', 
                    'format': 'text/html', 
                    'header': {}}
            client = Jeremia()
            result = client.submit_document(docs)
            pprint(result)
    '''
    URL_PATH = 'jeremia/rest'
    ATTRIBUTE_MAPPING = {'content_id': 'id', 
                         'title': 'title', 
                         'sentences': 'sentence',
                         'lang': 'lang',
                         'sentences_map': {'pos': 'pos',
                                           'token': 'token', 
                                           'value': 'value',
                                           'md5sum': 'id'}}
    
    def __init__(self, url=WEBLYZARD_API_URL, usr=WEBLYZARD_API_USER, pwd=WEBLYZARD_API_PASS):
        '''
        :param url: URL of the jeremia web service
        :param usr: optional user name
        :param pwd: optional password
        '''
        MultiRESTClient.__init__(self, service_urls=url, user=usr, password=pwd)


[docs]    def commit(self, batch_id, sentence_threshold=None):
        ''' 
        :param batch_id: the batch_id to retrieve 
        :return: a generator yielding all the documents of that particular batch 
        '''
        while True:
            
            path = 'commit/%s' % batch_id
            
            
            if not sentence_threshold is None:
                path = '%s?sentence_threshold=%s' % (path, sentence_threshold) 
            
            result = self.request(path)
            
            if not result:
                break
            else:
                for doc in result:
                    yield doc
                    
[docs]    def submit_document(self, document):
        '''
        processes a single document with jeremia (annotates a single document)

        :param document: the document to be processed
        '''
        return self.request('submit_document', document)
    
[docs]    def submit_documents(self, batch_id, documents):
        ''' 
        :param batch_id: batch_id to use for the given submission
        :param documents: a list of dictionaries containing the document 
        '''
        if not documents:
            raise ValueError('Cannot process an empty document list')
        return self.request('submit_documents/%s' % batch_id, documents)
    
[docs]    def status(self):
        '''
        :returns: the status of the Jeremia web service.
        '''
        return self.request('status', return_plain=True)

[docs]    def version(self):
        '''
        :returns: the current version of the jeremia deployed on the server
        '''
        return self.request('version', return_plain=True)
    
[docs]    def get_xml_doc(self, text, content_id='1'):
        '''
        Processes text and returns a XMLContent object.

        :param text: the text to process
        :param content_id: optional content id
        '''
        batch = [{'id': content_id, 
                  'title': '', 
                  'body': text, 
                  'format': 'text/plain'}]
        
        batch_id = str(time())
        self.submit_documents(batch_id, batch)
        results = list(self.commit(batch_id))
        result = results[0]
        return XMLContent(result['xml_content'])
    
[docs]    def submit_documents_blacklist(self, batch_id, documents, source_id):
        ''' submits the documents and removes blacklist sentences 

        :param batch_id: batch_id to use for the given submission
        :param documents: a list of dictionaries containing the document 
        :param source_id: source_id for the documents, determines the blacklist
        '''
        url = 'submit_documents_blacklist/%s/%s' % (batch_id, source_id)
        return self.request(url, documents)
    
[docs]    def update_blacklist(self, source_id, blacklist):
        ''' 
        updates an existing blacklist cache 

        :param source_id: the blacklist's source id
        '''
        url = 'cache/updateBlacklist/%s' % source_id
        return self.request(url, blacklist)
        
[docs]    def clear_blacklist(self, source_id):
        ''' 
        :param source_id: the blacklist's source id

        Empties the existing sentence blacklisting cache for the given source_id
        ''' 
        return self.request('cache/clearBlacklist/%s' % source_id)
        
[docs]    def get_blacklist(self, source_id):
        ''' 
        :param source_id: the blacklist's source id
        :returns: the sentence blacklist for the given source_id''' 
        return self.request('cache/getBlacklist/%s' % source_id)

[docs]    def submit(self, batch_id, documents, source_id=None, use_blacklist=False, 
               sentence_threshold=None):
        ''' Convenience function to submit documents. The function will submit
        the list of documents and finally call commit to retrieve the result

        :param batch_id: ID of the batch
        :param documents: list of documents (dict)
        :param source_id: 
        :param use_blacklist: use the blacklist or not 
        :returns: result as a list with dicts
        '''
        if use_blacklist: 
            if not source_id:
                raise Exception('Blacklist requires a source_id')
        
            url = 'submit_documents_blacklist/%s/%s' % (batch_id, source_id)
        else: 
            url = 'submit_documents/%s' % batch_id

        self.request(url, documents)
        
        return self.commit(batch_id, sentence_threshold=sentence_threshold) 

class JeremiaTest(unittest.TestCase):

    DOCS = [ {'id': content_id,
              'body': 'Good day Mr. President! Hello "world" ' + str(content_id),
              'title': 'Hello "world" more ',
              'format': 'text/html',
              'header': {}}  for content_id in xrange(1000,1020)]
    
                      
    def test_single_document_processing(self):
        j = Jeremia()
        print 'submitting document...'
        document_annotated = j.submit_document(self.DOCS[1])
        self.assertTrue(document_annotated != "")
    
    def test_single_document_with_annotations(self):
        '''
        Tests the handling of single document annotations.
        '''
        DOC = {'id'    : 12,
               'body'  : 'UBS has finally succeeded. They obtained a 10% share of CS.',
               'title' : 'UBS versus Credit Suisse.',
               'format': 'text/html',
               'title_annotation': [{'start': 0, 'end': 3, 'surfaceForm': 'UBS', 'key': 'http://dbpedia.org/UBS'},
                                    {'start':11, 'end':24, 'surfaceForm': 'Credit Suisse', 'key': 'http://dbpedia.org/Credit Suisse'}],
               'body_annotation' : [{'start': 0, 'end': 3, 'surfaceForm': 'UBS', 'key': 'http://dbpedia.org/UBS'},
                                    {'start':56, 'end':58, 'surfaceForm': 'CS', 'key': 'http://dbpedia.org/Credit Suisse'}],
               'header': {},
              }


        j = Jeremia()

        # this test requires Jeremia version 0.0.4+ 
        if j.version() < "0.0.4":
            return


        print 'submitting document with annotations...'
        result = j.submit_document(DOC)

        # check: all annotations have been preserved
        print result
        assert len(result['annotation']) == 4

        # check: annotations
        for annotation in result['annotation']:
            # title
            if annotation['md5sum'] == '8e3f3deac5e6c01dab521c07e3a60d7b':
                assert annotation['start'] == 0 or annotation['start'] == 11
                assert annotation['end'] == 3 or annotation['end'] == 24
            # first body sentence
            elif annotation['md5sum'] == 'ffafdc744dcda3d58ab6eafc86ad99b1':
                assert annotation['start'] == 0
                assert annotation['end'] == 3
            # second body sentence with adjusted indices
            elif annotation['md5sum'] == '25faaf0960a68ae741125ca436b330ee':
                assert annotation['start'] == 29
                assert annotation['end'] == 31

    def test_batch_processing(self):
        j = Jeremia()
        print 'Submitting documents...'
        j.submit_documents('1234', self.DOCS[:10])
        j.submit_documents('1234', self.DOCS[10:])
        
        # retrieve initial patch 
        print 'Retrieving results...'
        docs = list(j.commit('1234'))
        self.assertEqual(len(docs), 20)
        
        # no more results are available
        self.assertEqual(len(list(j.commit('1234'))), 0)

    def test_sentence_splitting(self):
        j = Jeremia()
        j.submit_documents( '1222', self.DOCS[:1] )

        for doc in j.commit('1222'):
            # extract sentences
            xml_obj = XMLContent(doc['xml_content'])
            sentences = [s.sentence for s in xml_obj.sentences]
            print doc['xml_content']
            assert 'wl:is_title' in doc['xml_content']
            print sentences
            
            # TODO: check sentence splitting in jeremia! 
            # self.assertEqual(len(sentences), 3)

    def test_illegal_xml_format_filtering(self):
        DOCS = [ {'id': 'alpha',
                  'body': 'This is an illegal XML Sequence: J\x1amica',
                  'title': 'Hello "world" more ',
                  'format': 'text/html',
                  'header': {}} ]

        j = Jeremia()
        j.submit_documents( '12234', DOCS )
        for doc in list(j.commit('12234')):
            xml = XMLContent(doc['xml_content'])
            print doc['xml_content']
            assert xml.sentences[0].sentence != None
       
    def test_illegal_input_args(self):
        j = Jeremia()

        with self.assertRaises(ValueError):
            j.submit_documents('1223', [])
    
    def test_submit(self):
        j = Jeremia()
        result = j.submit(batch_id='meh1234', 
                          documents=self.DOCS, 
                          use_blacklist=False)
        assert len(list(result)), 'result is empty'
        
if __name__ == '__main__':
    if len(argv) > 1:
        txt = argv[1]
        docs = {'id': '192292', 
                'body': txt, 
                'title': '', 
                'format': 'text/html', 
                'header': {}}
        j = Jeremia()
        docs['body_annotation'] = [{'start':0, 'end': 3, 'key': 'test annotation'}]
        l = j.submit_document(docs)
        print l
    else:
        unittest.main()