Source code for weblyzard_api.client.classifier

'''
Created on Jan 16, 2013

.. codeauthor: Albert Weichselbraun <albert.weichselbraun@htwchur.ch>
.. codeauthor: Norman Suesstrunk <norman.suesstrunk@htwchur.ch>
.. codeauthor: Philipp Kuntschik <philipp.kuntschik@htwchur.ch>
'''
import unittest

from eWRT.ws.rest import  MultiRESTClient
from weblyzard_api.client import WEBLYZARD_API_URL, WEBLYZARD_API_USER, WEBLYZARD_API_PASS
from sys import argv

class Classifier(MultiRESTClient):
    '''
    **Classifier**

    Provides support for text classification. Every request is issued
    against an endpoint below ``CLASSIFIER_WS_BASE_PATH``.
    '''
    # path prefix shared by all endpoints this client calls
    CLASSIFIER_WS_BASE_PATH = '/joseph/rest/'

    def __init__(self, url=WEBLYZARD_API_URL, usr=WEBLYZARD_API_USER,
                 pwd=WEBLYZARD_API_PASS):
        '''
        :param url: URL of the classifier web service
        :param usr: optional user name
        :param pwd: optional password
        '''
        MultiRESTClient.__init__(self, service_urls=url, user=usr,
                                 password=pwd)

    def hello_world(self):
        '''
        Simple hello world test: requests the ``helloworld`` endpoint.

        :returns: the result of the request
        '''
        endpoint = self.CLASSIFIER_WS_BASE_PATH + 'helloworld'
        return self.request(endpoint)

    def classify(self, classifier_profile, weblyzard_xml, search_agents=None,
                 num_results=1):
        '''
        Classify a weblyzard XML document with the given classifier profile.

        :param classifier_profile: the profile to use for classification \
                (e.g. 'COMET', 'MK')
        :param weblyzard_xml: weblyzard_xml representation of the document \
                to classify
        :param search_agents: an optional list of search agents \
                (e.g. ``[1,2,3]``); only sent if it is not ``None``
        :param num_results: number of classes to return
        :returns: a dictionary mapping each entry's ``searchagent`` to its \
                ``classification``
        '''
        payload = {'xml_document': weblyzard_xml,
                   'numOfResults': num_results}
        if search_agents is not None:
            payload['searchAgents'] = search_agents

        endpoint = (self.CLASSIFIER_WS_BASE_PATH + 'classify/'
                    + classifier_profile)
        response = self.request(endpoint, payload)

        # re-key the returned entries by their search agent
        by_search_agent = {}
        for item in response:
            by_search_agent[item['searchagent']] = item['classification']
        return by_search_agent

    def train(self, classifier_profile, weblyzard_xml, correct_category,
              incorrect_category=None, document_timestamp=None):
        '''
        Trains (and corrects) the classifier's knowledge base.

        Without ``incorrect_category`` the document is sent to the ``learn/``
        endpoint, otherwise to the ``retrain/`` endpoint.

        :param classifier_profile: the profile to use for classification \
                (e.g. 'COMET', 'MK')
        :param weblyzard_xml: weblyzard_xml representation of the document \
                to learn
        :param correct_category: the correct category for the document
        :param incorrect_category: optional information on the incorrect \
                category returned for this document
        :param document_timestamp: an optional timestamp, specifying when \
                the document has been classified (used for retraining \
                temporal knowledge bases)
        :returns: a response object with a status code and message.
        '''
        is_correction = incorrect_category is not None
        request_type = 'retrain/' if is_correction else 'learn/'

        payload = {'xml_document': weblyzard_xml,
                   'category': correct_category}
        if is_correction:
            payload['oldCategory'] = incorrect_category
        if document_timestamp is not None:
            payload['documentTimeStamp'] = document_timestamp

        endpoint = (self.CLASSIFIER_WS_BASE_PATH + request_type
                    + classifier_profile)
        return self.request(endpoint, payload)
class TestClassifier(unittest.TestCase):
    '''
    Integration test for ``Classifier``: it calls the web service, so it
    needs a reachable service behind the client's default settings.
    '''

    # sample weblyzard XML document submitted to the classifier
    WEBLYZARD_XML = """<?xml version="1.0" encoding="UTF-8"?> <wl:page xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:wl="http://www.weblyzard.com/wl/2013#" wl:id="1001" dc:format="text/html" xml:lang="en"> <wl:title>Hello "world" more </wl:title> <wl:body>Get in touch with Fast Track via email or Facebook. And follow us on Pinterest.1001</wl:body> <wl:sentence wl:id="26d2d0113429b0dc98352c2b5fd842a1" wl:pos="1:UH -1:' 3:NN 1:' 1:RBR " wl:token="0,5 6,7 7,12 12,13 14,18" wl:is_title="true" wl:sem_orient="0.0" wl:significance="0.0"><![CDATA[Hello "world" more]]></wl:sentence> <wl:sentence wl:id="7082ae05193c64ba5defe5e54ed15b98" wl:pos="-1:VB 0:IN 1:NN 2:IN 5:JJ 3:NNP 0:IN 6:NN 7:CC 8:NNP 0:. " wl:token="0,3 4,6 7,12 13,17 18,22 23,28 29,32 33,38 39,41 42,50 50,51" wl:sem_orient="0.0" wl:significance="0.0"><![CDATA[Get in touch with Fast Track via email or Facebook.]]></wl:sentence> <wl:sentence wl:id="e5adef7b4beb1fd4c8edd26ba1d2825c" wl:pos="1:CC -1:VB 1:PRP 1:IN 3:NNP 1:CD " wl:token="0,3 4,10 11,13 14,16 17,26 26,31" wl:sem_orient="0.0" wl:significance="0.0"><![CDATA[And follow us on Pinterest.1001]]></wl:sentence> <wl:content>Hello "world" more Get in touch with Fast Track via email or Facebook. 
And follow us on Pinterest.1001</wl:content> </wl:page>"""

    def test_submit_classify(self):
        ''' tests the basic submit routine '''
        classifier = Classifier()
        search_agents = [1, 2, 3]
        num_results = 3

        # call the web service
        result = classifier.classify('COMET',
                                     weblyzard_xml=self.WEBLYZARD_XML,
                                     search_agents=search_agents,
                                     num_results=num_results)

        # every search_agent should be covered in the result; the keys are
        # compared order-independently because dict key order need not match
        # the request order (and a Python 3 keys view never equals a list)
        self.assertEqual(sorted(result.keys()), sorted(search_agents))

        # for every search_agent are 'num_results' classes returned
        for classes in result.values():
            self.assertEqual(len(classes), num_results)
        # single-argument print(...) behaves the same on Python 2 and 3
        print(result)


if __name__ == '__main__':
    if len(argv) == 1:
        # no arguments: run the unit tests
        unittest.main()
    else:
        # otherwise argv[1] names a JSON file holding a classification
        # request with the keys 'xml_document', 'searchAgents' and
        # 'numOfResults'
        from json import load
        fname = argv[1]
        with open(fname) as f:
            j = load(f)
        print(Classifier().classify('COMET',
                                    weblyzard_xml=j['xml_document'],
                                    search_agents=j['searchAgents'],
                                    num_results=j['numOfResults']))