Source code for weblyzard_api.client.recognize

#!/usr/bin/python
# -*- coding: utf8 -*-
'''
.. moduleauthor:: Albert Weichselbraun <albert.weichselbraun@htwchur.ch> 
'''
import logging
import unittest
from pprint import pprint

from eWRT.access.http import Retrieve
from eWRT.ws.rest import MultiRESTClient

from weblyzard_api.xml_content import XMLContent
from weblyzard_api.client import (WEBLYZARD_API_URL, WEBLYZARD_API_USER,
                                  WEBLYZARD_API_PASS)

INTERNAL_PROFILE_PREFIX = 'extras.'
LOGGER = logging.getLogger('weblyzard_api.client.recognize')
SUPPORTED_LANGS = ('en', 'fr', 'de')

class Recognize(MultiRESTClient):
    '''
    Provides access to the Recognize Web Service.

    **Workflow:**

    1. pre-load the Recognize profiles you need using the
       :func:`add_profile` call.
    2. submit the text or documents to analyze using one of the
       following calls:

       * :func:`search_document` or :func:`search_documents` for document
         dictionaries.
       * :func:`search_text` for plain text.

    .. note:: Example usage

        .. code-block:: python

            from weblyzard_api.client.recognize import Recognize
            from pprint import pprint

            url = 'http://triple-store.ai.wu.ac.at/recognize/rest/recognize'
            profile_names = ['en.organization.ng', 'en.people.ng',
                             'en.geo.500000.ng']
            text = ('Microsoft is an American multinational corporation '
                    'headquartered in Redmond, Washington, that develops, '
                    'manufactures, licenses, supports and sells computer '
                    'software, consumer electronics and personal computers '
                    'and services. It was founded by Bill Gates and '
                    'Paul Allen on April 4, 1975.')

            client = Recognize(url)
            result = client.search_text(profile_names,
                                        text,
                                        output_format='compact',
                                        max_entities=40,
                                        buckets=40,
                                        limit=40)
            pprint(result)
    '''
    OUTPUT_FORMATS = ('standard', 'minimal', 'annie', 'compact')
    URL_PATH = 'recognize/rest/recognize'
    ATTRIBUTE_MAPPING = {'content_id': 'id',
                         'lang': 'xml:lang',
                         'sentences': 'sentence',
                         'sentences_map': {'pos': 'pos',
                                           'token': 'token',
                                           'md5sum': 'id',
                                           'value': 'value'}}

    def __init__(self, url=WEBLYZARD_API_URL, usr=WEBLYZARD_API_USER,
                 pwd=WEBLYZARD_API_PASS):
        '''
        :param url: URL of the Recognize web service
        :param usr: optional user name
        :param pwd: optional password
        '''
        MultiRESTClient.__init__(self, service_urls=url, user=usr,
                                 password=pwd, use_random_server=True)
        self.profile_cache = []

    @classmethod
    def convert_document(cls, xml):
        ''' converts an XML string to the document dictionary necessary for
        transmitting the document to Recognize.

        :param xml: weblyzard_xml representation of the document
        :returns: the converted document
        :rtype: dict

        .. note::
            non-sentences are ignored and titles are added based on the
            XMLContent's interpretation of the document.
        '''
        if not isinstance(xml, XMLContent):
            xml = XMLContent(xml)
        return xml.as_dict(mapping=cls.ATTRIBUTE_MAPPING,
                           ignore_non_sentence=False,
                           add_titles_to_sentences=True)

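    # Usage sketch (hedged): converting a weblyzard XML string before
    # submitting it via search_documents(); the XML snippet below is a
    # placeholder, not a complete document.
    #
    #     xml_doc = '<?xml version="1.0" encoding="UTF-8"?><wl:page ...>...</wl:page>'
    #     doc_dict = Recognize.convert_document(xml_doc)
    #     # doc_dict uses the keys defined in ATTRIBUTE_MAPPING
    #     # ('id', 'xml:lang', 'sentence', ...) expected by the XML search calls.
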
    def list_profiles(self):
        ''' :returns: a list of all pre-loaded profiles

        .. code-block:: python

            >>> r = Recognize()
            >>> r.list_profiles()
            [u'Cities.DACH.10000.de_en', u'People.DACH.de']
        '''
        return self.request('list_profiles')

    def list_configured_profiles(self):
        ''' :returns: a list of all profiles supported in the current
            configuration '''
        return self.request('list_configured_profiles')

    def add_profile(self, profile_name, force=False):
        ''' pre-loads the given profile

        :param profile_name: name of the profile to load.
        :param force: re-submit the profile even if it is already known
            to the client.
        '''
        is_internal_profile = profile_name.startswith(INTERNAL_PROFILE_PREFIX)
        profile_exists = profile_name in self.profile_cache and not force
        if not profile_exists:
            profile_exists = profile_name in self.list_profiles() and not force

        if profile_exists and profile_name not in self.profile_cache:
            self.profile_cache.append(profile_name)

        if not is_internal_profile and not profile_exists:
            self.profile_cache.append(profile_name)  # only try to add once
            return self.request('add_profile/%s' % profile_name)

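    # Usage sketch (hedged): profiles are cached client-side, so repeated
    # calls only hit the web service once unless force=True; the profile
    # name is one of the profiles referenced in the tests below.
    #
    #     client = Recognize()
    #     client.add_profile('en.people.ng')               # submits the profile
    #     client.add_profile('en.people.ng')               # served from profile_cache
    #     client.add_profile('en.people.ng', force=True)   # re-submits the profile
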
    def get_xml_document(self, document):
        ''' :returns: the correct XML representation required by the
            Recognize service '''
        return document.xml_content.as_dict(self.ATTRIBUTE_MAPPING)

    def remove_profile(self, profile_name):
        ''' removes a profile from the list of pre-loaded profiles '''
        return self.request('remove_profile/%s' % profile_name)

    def search_text(self, profile_names, text, debug=False, max_entities=1,
                    buckets=1, limit=1, output_format='minimal'):
        ''' Search text for entities specified in the given profiles.

        :param profile_names: the profiles to search in
        :param text: the text to search in
        :param debug: compute and return an explanation
        :param buckets: only return n buckets of hits with the same score
        :param max_entities: number of results to return (removes the top
            hit's tokens and rescores the result list subsequently)
        :param limit: only return that many results
        :param output_format: the output format to use ('standard',
            *'minimal'*, 'annie')
        :rtype: the tagged text
        '''
        assert output_format in self.OUTPUT_FORMATS
        if isinstance(profile_names, basestring):
            profile_names = (profile_names, )

        for profile_name in profile_names:
            self.add_profile(profile_name)

        return self.request(path='search',
                            parameters=text,
                            query_parameters={'profileNames': profile_names,
                                              'rescore': max_entities,
                                              'buckets': buckets,
                                              'limit': limit,
                                              'wt': output_format,
                                              'debug': debug})

    def search_document(self, profile_names, document, debug=False,
                        max_entities=1, buckets=1, limit=1,
                        output_format='minimal'):
        '''
        :param profile_names: a list of profile names
        :param document: a single document to analyze (see the example
            documents below)
        :param debug: compute and return an explanation
        :param buckets: only return n buckets of hits with the same score
        :param max_entities: number of results to return (removes the top
            hit's tokens and rescores the result list subsequently)
        :param limit: only return that many results
        :param output_format: the output format to use ('standard',
            *'minimal'*, 'annie')
        :rtype: the tagged dictionary

        .. note:: Example documents

            .. code-block:: python

                # option 1: document dictionary
                {'content_id': 12,
                 'content': u'the text to analyze'}

                # option 2: weblyzardXML
                XMLContent('<?xml version="1.0"...').as_list()

        .. note:: Corresponding web call

            http://localhost:8080/recognize/searchXml/ofwi.people
        '''
        assert output_format in self.OUTPUT_FORMATS
        if not document:
            return

        if isinstance(profile_names, basestring):
            profile_names = [profile_names, ]

        # iterate over a copy, as failed profiles are removed from the list
        for profile_name in list(profile_names):
            try:
                self.add_profile(profile_name)
            except Exception:
                profile_names.remove(profile_name)
                msg = 'could not load profile %s, skipping' % profile_name
                LOGGER.warn(msg)

        content_type = 'application/json'
        if 'content_id' in document:
            search_command = 'search'
        elif 'id' in document:
            search_command = 'searchXml'
        else:
            raise ValueError("Unsupported input format.")

        return self.request(path=search_command,
                            parameters=document,
                            content_type=content_type,
                            query_parameters={'profileNames': profile_names,
                                              'rescore': max_entities,
                                              'buckets': buckets,
                                              'limit': limit,
                                              'wt': output_format,
                                              'debug': debug})

    def search_documents(self, profile_names, doc_list, debug=False,
                         max_entities=1, buckets=1, limit=1,
                         output_format='annie'):
        '''
        :param profile_names: a list of profile names
        :param doc_list: a list of documents to analyze (see the example
            below)
        :param debug: compute and return an explanation
        :param buckets: only return n buckets of hits with the same score
        :param max_entities: number of results to return (removes the top
            hit's tokens and rescores the result list subsequently)
        :param limit: only return that many results
        :param output_format: the output format to use ('standard',
            *'minimal'*, 'annie')
        :rtype: the tagged dictionary

        .. note:: Example documents

            .. code-block:: python

                # option 1: list of document dictionaries
                ({'content_id': 12,
                  'content': u'the text to analyze'}, )

                # option 2: list of weblyzardXML dictionary representations
                (XMLContent('<?xml version="1.0"...').as_list(),
                 XMLContent('<?xml version="1.0"...').as_list(), )
        '''
        assert output_format in self.OUTPUT_FORMATS
        if not doc_list:
            return

        if isinstance(profile_names, basestring):
            profile_names = (profile_names, )

        # profiles prefixed with a supported language code can be added directly
        profiles_to_add = []
        for profile_name in profile_names:
            for lang in SUPPORTED_LANGS:
                if profile_name.startswith(lang):
                    profiles_to_add.append(profile_name)

        remaining = set(profile_names).difference(set(profiles_to_add))
        if len(remaining):
            # get all required languages from the documents
            lang_list = []
            for document in doc_list:
                if 'lang' in document:
                    lang_list.append(document['lang'])
            lang_list = set(lang_list)

            # add the required profiles
            if isinstance(profile_names, dict):
                for lang in lang_list:
                    if lang in profile_names:
                        for profile_name in profile_names[lang]:
                            profiles_to_add.append(profile_name)
            else:
                for profile_name in profile_names:
                    profiles_to_add.append(profile_name)

        # pre-load the required profiles
        for profile_name in set(profiles_to_add):
            self.add_profile(profile_name)

        content_type = 'application/json'
        if 'content_id' in doc_list[0]:
            search_command = 'searchDocuments'
        elif 'id' in doc_list[0]:
            search_command = 'searchXmlDocuments'
        else:
            raise ValueError("Unsupported input format.")

        return self.request(path=search_command,
                            parameters=doc_list,
                            content_type=content_type,
                            query_parameters={'profileNames': profile_names,
                                              'rescore': max_entities,
                                              'buckets': buckets,
                                              'limit': limit,
                                              'wt': output_format,
                                              'debug': debug})

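    # Usage sketch (hedged): annotating a small batch of document
    # dictionaries; the content_ids and texts are made up for illustration.
    #
    #     client = Recognize()
    #     docs = [{'content_id': '1', 'content': u'Bill Gates founded Microsoft.'},
    #             {'content_id': '2', 'content': u'Paul Allen was born in Seattle.'}]
    #     result = client.search_documents(['en.people.ng', 'en.organization.ng'],
    #                                      docs, output_format='compact')
    #     # result is keyed by content_id (see test_geo below).
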
    def get_focus(self, profile_names, doc_list, max_results=1):
        '''
        :param profile_names: a list of profile names
        :param doc_list: a list of documents to analyze based on the
            weblyzardXML format
        :param max_results: maximum number of results to include
        :returns: the focus and annotation of the given document

        .. note:: Corresponding web call

            http://localhost:8080/recognize/focus?profiles=ofwi.people&profiles=ofwi.organizations.context
        '''
        if isinstance(profile_names, basestring):
            profile_names = (profile_names, )

        if not doc_list:
            return
        elif 'id' not in doc_list[0]:
            raise ValueError('Unsupported input format.')

        # add missing profiles
        for profile_name in profile_names:
            self.add_profile(profile_name)

        return self.request(path='focusDocuments',
                            parameters=doc_list,
                            query_parameters={'profiles': profile_names,
                                              'rescore': max_results,
                                              'buckets': max_results,
                                              'limit': max_results})

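    # Usage sketch (hedged): get_focus() expects weblyzard XML dictionaries
    # (documents containing an 'id' key), e.g. as produced by
    # convert_document(); plain content dictionaries raise a ValueError.
    #
    #     client = Recognize()
    #     xml_docs = [Recognize.convert_document(xml) for xml in ('<wl:page ...>', )]
    #     focus = client.get_focus(['en.people.ng'], xml_docs, max_results=3)
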
    def status(self):
        '''
        :returns: the status of the Recognize web service.
        '''
        return self.request(path='status')


class EntityLyzardTest(unittest.TestCase):

    DOCS_XML = [
        '''
        <?xml version="1.0" encoding="UTF-8"?>
        <wl:page xmlns:dc="http://purl.org/dc/elements/1.1/"
                 xmlns:wl="http://www.weblyzard.com/wl/2013#"
                 dc:title=""
                 wl:id="99933"
                 dc:format="text/html"
                 xml:lang="de"
                 wl:nilsimsa="030472f84612acc42c7206e07814e69888267530636221300baf8bc2da66b476"
                 dc:related="http://www.heise.de http://www.kurier.at">
            <wl:sentence wl:id="50612085a00cf052d66db97ff2252544"
                         wl:pos="NE NE VAFIN CARD NE NE VVPP $."
                         wl:token="0,5 6,12 13,16 17,19 20,23 24,27 28,36 36,37"
                         wl:sem_orient="0.0"
                         wl:significance="0.0"><![CDATA[Georg Müller hat 10 Mio CHF gewonnen.]]></wl:sentence>
            <wl:sentence wl:id="a3b05957957e01060fd58af587427362"
                         wl:pos="NN NE VMFIN APPR ART NN APPR CARD NE NE $, PRELS PPER NE NE VVFIN $, PIS VVINF $."
                         wl:token="0,4 5,12 13,19 20,23 24,27 28,35 36,39 40,42 43,46 47,50 50,51 52,55 56,59 60,65 66,72 73,84 84,85 86,92 93,101 101,102"
                         wl:sem_orient="0.0"
                         wl:significance="0.0"><![CDATA[Herr Schmidt konnte mit dem Angebot von 10 Mio CHF, das ihm Georg Müller hinterlegte, nichts anfangen.]]></wl:sentence>
        </wl:page>
        ''',
        '''
        <?xml version="1.0" encoding="UTF-8"?>
        <wl:page xmlns:dc="http://purl.org/dc/elements/1.1/"
                 xmlns:wl="http://www.weblyzard.com/wl/2013#"
                 dc:title=""
                 wl:id="99934"
                 dc:format="text/html"
                 xml:lang="de"
                 wl:nilsimsa="020ee211a20084bb0d2208038548c02405bb0110d2183061db9400d74c15553a"
                 dc:related="http://www.heise.de http://www.kurier.at">
            <wl:sentence wl:id="f98a0c4d2ddffd60b64b9b25f1f5657a"
                         wl:pos="NN NE VVFIN $, KOUS ART NN ADV CARD ADJD VAINF VAFIN $."
                         wl:token="0,6 7,14 15,23 23,24 25,29 30,33 34,37 38,42 43,47 48,59 60,64 65,69 69,70"
                         wl:sem_orient="0.0"
                         wl:significance="0.0"><![CDATA[Rektor Kessler erklärte, dass die HTW auch 2014 erfolgreich sein wird.]]></wl:sentence>
        </wl:page>
        ''']

    DOCS = [Recognize.convert_document(xml) for xml in DOCS_XML]

    # we need to get the recognize client twice (once here and once in setUp)
    TESTED_PROFILES = ['de.people.ng', 'en.geo.500000.ng',
                       'en.organization.ng', 'en.people.ng']
    IS_ONLINE = True

    def setUp(self):
        self.available_profiles = []
        self.client = Recognize()
        self.service_is_online = self.client.is_online()

        if not self.service_is_online:
            print 'WARNING: Webservice is offline --> not executing all tests!!'
            self.IS_ONLINE = False
            return

        recognize_profiles = self.client.list_profiles()
        for profile in recognize_profiles:
            if profile in self.TESTED_PROFILES:
                self.available_profiles.append(profile)

        self.all_profiles = self.client.list_profiles()

    def test_missing_profiles(self):
        self.missing_profiles = []
        if self.IS_ONLINE and self.service_is_online:
            if len(self.available_profiles) == len(self.TESTED_PROFILES):
                print "All profiles are available on the current server"
            else:
                for profile in self.TESTED_PROFILES:
                    if profile not in self.available_profiles:
                        self.missing_profiles.append(profile)
                print "Missing profiles: ", self.missing_profiles

    def test_entity_lyzard(self):
        docs = [{'content_id': '12', 'content': u'Franz Klammer fährt Ski'},
                {'content_id': '13', 'content': u'Peter Müller macht Politik'}]

        required_profile = 'de.people.ng'
        if required_profile not in self.available_profiles:
            print "Profile %s not available!" % required_profile
            return

        # test whether we can add profiles and whether a German profile works
        if self.IS_ONLINE and self.service_is_online:
            print self.client.list_profiles()
            self.client.add_profile('de.people.ng')
            print self.client.search_documents('de.people.ng', docs)

    def test_search_xml(self):
        required_profile = 'de.people.ng'
        if required_profile not in self.available_profiles:
            print "Profile %s not available!" % required_profile
            return

        if self.IS_ONLINE and self.service_is_online:
            self.client.add_profile('de.people.ng')
            result = self.client.search_documents('de.people.ng', self.DOCS)
            print 'xmlsearch::::', result

    def test_geo(self):
        required_profile = 'en.geo.500000.ng'
        if required_profile not in self.available_profiles:
            print "Profile %s not available!" % required_profile
            return

        geodocs = [{'content_id': '11',
                    'content': u'Frank goes to Los Angeles. Los Angeles is a nice city'}, ]

        if self.IS_ONLINE and self.service_is_online:
            profile_name = 'en.geo.500000.ng'
            print self.client.list_configured_profiles()
            print self.client.add_profile(profile_name, force=True)
            print 'list_configured_profiles', self.client.list_configured_profiles()

            self.client.add_profile('Cities.10000.en')
            self.client.search_documents(profile_names=profile_name,
                                         doc_list=geodocs,
                                         debug=True,
                                         output_format='standard')
            print 'list_profiles', self.client.list_profiles()

            result = self.client.search_documents(profile_name, geodocs,
                                                  output_format='compact')
            first = result['11']
            print 'result', len(result), first[0]['preferredName']

    def test_geo_swiss(self):
        ''' Tests the geo annotation service for Swiss media samples.

        .. note::
            ``de_CH.geo.5000.ng`` detects Swiss cities with more than 5000
            and worldwide cities with more than 500,000 inhabitants.
        '''
        required_profile = 'de_CH.geo.5000.ng'
        if required_profile not in self.available_profiles:
            print("Profile %s not available!" % required_profile)
            return

        if 'noah.semanticlab.net' not in self.client.url:
            print("This test is only run on noah...\n...skipping test.")
            return

        self.client.add_profile(required_profile)

    def test_organization(self):
        required_profile = 'en.organization.ng'
        if required_profile not in self.available_profiles:
            print "Profile %s not available!" % required_profile
            return

        docs = [{'content_id': '14',
                 'content': u'Bill Gates was the CEO of Microsoft.'},
                {'content_id': '15',
                 'content': u'Facebook is one of the largest social networks.'}]

        if self.IS_ONLINE and self.service_is_online:
            print self.client.list_profiles()
            self.client.add_profile('en.organization.ng')
            print self.client.search_documents('en.organization.ng', docs)

    def test_people(self):
        required_profile = 'en.people.ng'
        if required_profile not in self.available_profiles:
            print "Profile %s not available!" % required_profile
            return

        docs = [{'content_id': '16',
                 'content': u'George W. Bush is a former President.'},
                {'content_id': '17',
                 'content': u'Mark Zuckerberg speaks Chinese.'}]

        if self.IS_ONLINE and self.service_is_online:
            print self.client.list_profiles()
            self.client.add_profile('en.people.ng')
            print self.client.search_documents('en.people.ng', docs)

    def test_password(self):
        test_cases = (('http://test.net', 'test', 'password'),
                      ('http://test.net', None, None),
                      (['http://test.net', 'http://test2.net'], 'test', 'password'),
                      (['http://test.net', 'http://test2.net'], None, None))

        for urls, user, password in test_cases:
            correct_urls = Recognize.fix_urls(urls, user, password)
            assert isinstance(correct_urls, list)

            if isinstance(urls, basestring):
                assert len(correct_urls) == 1
            else:
                assert len(urls) == len(correct_urls)

            for correct_url in correct_urls:
                assert correct_url.endswith(Recognize.URL_PATH)

                user_password = '%s:%s@' % (user, password)
                if user and password:
                    assert user_password in correct_url
                    ext_url, ext_user, ext_password = Retrieve.get_user_password(correct_url)
                    assert ext_user == user
                    assert ext_password == password
                    assert user_password not in ext_url
                else:
                    assert user_password not in correct_url


if __name__ == '__main__':
    # unittest.main()
    required_profile = 'de_CH.geo.5000.ng'
    client = Recognize()
    # client.remove_profile(required_profile)
    client.add_profile(required_profile)

    for text in ('Haldenstein liegt in der Nähe von Landquart.',
                 'Sargans hat einen wichtigen Bahnhof',
                 'Vinzenz arbeitet in Winterthur'):
        result = client.search_text(required_profile, text,
                                    output_format='compact')
        print(result)

    required_profile = 'snf.media.criticism.project'
    client.add_profile(required_profile)
    print client.search_text(required_profile,
                             "Die SRG und die SRF sind sehr kritisch was das Engagement der NZZ betrifft",
                             output_format='compact')
    print client.search_text(required_profile,
                             "die srg und die srf sind sehr kritisch was das engagement der nzz betrifft",
                             output_format='compact')