Suradnik:4ndY/hjp.py

Izvor: HrOpenWiki
Inačica od 00:06, 12. studenoga 2012. koju je unio/unijela 4ndY (razgovor | doprinosi)

(razl) ←Starija inačica | vidi trenutačnu inačicu (razl) | Novija inačica→ (razl)
Skoči na: orijentacija, traži

Ova skripta se više ne održava na ovoj lokaciji, već se aktualna verzija može naći na ovoj adresi.

#!/usr/bin/python
# -*- coding: utf-8 -*-

###################################
# author: Andrej Dundovic
# date: 5. 2012.
# contact: andrej AT dundovic DOT com.hr
# licence: GPLv3
# description: script for Croatian
# dictionary (HJP) look-up
###################################

import requests 
from BeautifulSoup import BeautifulSoup
import re, htmlentitydefs
import sys

class Word:
    """Fetch and print dictionary entries from hjp.novi-liber.hr.

    Typical use: ``set_options()`` (optional), ``search(word)``, then
    ``print_clean()`` or ``print_html()``.

    Option flags (any combination, order is irrelevant):
        n  word name        u  entry URL
        g  grammar          d  definition
        s  syntagm          p  phraseology ('f' is accepted as well)
        o  onomastics       e  etymology
    """

    # Matches anything that looks like an HTML tag (on a single line).
    _TAG_RE = re.compile(r'<.*?>')
    # Matches numeric (&#263; / &#x107;) and named (&amp;) HTML entities.
    _ENTITY_RE = re.compile(r"&#?\w+;")

    # (result key, element id) for every text section read from an entry.
    _SECTIONS = (
        ('name', 'natuknica_raster_frame'),
        ('grammar', 'osnovni_podaci_frame'),
        ('definition', 'definicija_frame'),
        ('syntagm', 'sintagma_frame'),
        ('phraseology', 'frazeologija_frame'),
        ('onomastics', 'onomastika_frame'),
        ('etymology', 'etimologija_frame'),
    )

    # (flags, heading, result key) for the headed sections of print_html().
    # A section is selected when ANY of its flags is present in the options.
    _HTML_SECTIONS = (
        ('d', "Definicija", 'definition'),
        ('s', "Sintagma", 'syntagm'),
        ('pf', "Frazeologija", 'phraseology'),
        ('o', "Onomastika", 'onomastics'),
        ('e', "Etimologija", 'etymology'),
    )

    # (flags, label, result key) for print_clean(), in output order.
    _CLEAN_SECTIONS = (
        ('n', u"> Rijec: ", 'name'),
        ('u', u"> URL: ", 'url'),
        ('g', u"> Gramatika: ", 'grammar'),
        ('d', u"> Definicije: ", 'definition'),
        ('s', u"> Sintagma: ", 'syntagm'),
        ('pf', u"> Frazeologija: ", 'phraseology'),
        ('o', u"> Onomastika: ", 'onomastics'),
        ('e', u"> Etimologija: ", 'etymology'),
    )

    def __init__(self):
        """Set default options, service URLs and empty result containers."""
        self.options = 'nugdspoe'  # default options (all sections)
        self.useful_data = ''      # parsed HTML of the entry being processed
        self.output = ''           # text built by the last print_* call
        self.data = []             # one dict per extracted definition
        self.target_url = ''       # URL used by the next get_content() call
        self.service_url = 'http://hjp.novi-liber.hr/'
        self.autocompl_url = 'http://hjp.novi-liber.hr/hjp_ajax.php'
        self.target_word = ''      # word currently being looked up
        self.words = []            # autocomplete suggestions for target_word

    def set_options(self, options):
        """Set which parts of a definition will be displayed."""
        self.options = options

    def get_options(self):
        """Return the flags selecting which parts are displayed."""
        return self.options

    def _wants(self, flags):
        """Return True when any character of *flags* is in the options."""
        return any(flag in self.options for flag in flags)

    def search(self, word):
        """Look up *word* and append its definition(s) to ``self.data``.

        The autocomplete service is queried first. When it offers more than
        one suggestion they are all printed; the first suggestion is the
        word that is actually looked up. A result page without a
        ``natuknica_raster_frame`` element is treated as a list of homonyms
        and every linked entry is fetched.

        Results accumulate: ``self.data`` is not cleared between calls.
        When the autocomplete service returns no suggestion, nothing is
        looked up and ``self.data`` is left unchanged.
        """
        self.target_word = word
        self.get_autocomplete()

        if not self.words:
            # Nothing to look up; indexing self.words[0] would raise.
            return

        if len(self.words) > 1:
            for suggestion in self.words:
                print(suggestion)
            print('')

        # Take the first suggestion from the list.
        self.target_word = self.words[0]
        self.target_url = self.service_url + 'index.php?show=search'
        self.useful_data = BeautifulSoup(
            self.get_content({'word': self.target_word}))

        # No entry frame on the page means the word is a homonym.
        if self.useful_data.find(
                attrs={"id": "natuknica_raster_frame"}) is None:
            self.multiple_definition()
        else:
            self.extract_data()

    def get_content(self, post_data):
        """POST *post_data* to ``self.target_url`` and return the entry HTML.

        Returns the markup of the first element with class ``natuknica`` as
        a string, or an empty string when the page has no such element.
        """
        req = requests.post(self.target_url, data=post_data)
        entry = BeautifulSoup(req.text).find(attrs={"class": "natuknica"})
        return '' if entry is None else str(entry)

    def get_autocomplete(self):
        """Store the autocomplete suggestions for ``self.target_word``.

        The response body is split on newlines; blank lines (including the
        one produced by a trailing newline) are ignored, so the last
        suggestion is kept even when the response does not end in a newline.
        """
        get_data = {'q': self.target_word, 'limit': 10, 's': 's'}
        req = requests.get(self.autocompl_url, params=get_data, timeout=1)
        self.words = [line for line in req.text.split('\n') if line.strip()]

    def multiple_definition(self):
        """Fetch and extract every entry linked from a homonym list page."""
        # Collected before the loop because self.useful_data is replaced
        # inside it.
        links = self.useful_data.findAll('a')
        for choice in links:
            href = choice.get('href')
            if not href:
                continue  # anchor without a target: nothing to fetch
            self.target_url = self.service_url + href
            self.useful_data = BeautifulSoup(self.get_content(''))
            self.extract_data()

    def unescape(self, text):
        """Replace HTML character references and named entities in *text*.

        Handles decimal (``&#263;``), hexadecimal (``&#x107;``) and named
        (``&amp;``) forms. Unknown or malformed entities are left as they
        are.
        """
        try:
            to_char = unichr  # Python 2
        except NameError:
            to_char = chr     # Python 3: chr() already returns text

        def fixup(match):
            entity = match.group(0)
            try:
                if entity[:3] == "&#x":
                    return to_char(int(entity[3:-1], 16))
                if entity[:2] == "&#":
                    return to_char(int(entity[2:-1]))
                return to_char(htmlentitydefs.name2codepoint[entity[1:-1]])
            except (ValueError, KeyError, OverflowError):
                return entity  # leave as is

        return self._ENTITY_RE.sub(fixup, text)

    def remove_html_tags(self, string):
        """Return *string* as plain text: tags removed, entities decoded.

        *string* may be any object whose ``str()`` is HTML. ``None`` (what
        ``find()`` yields for a section missing from the page) gives an
        empty string rather than the text ``'None'``.

        Tags are stripped before entities are decoded, so escaped text such
        as ``&lt;b&gt;`` survives as literal ``<b>`` instead of being
        removed as if it were markup.
        """
        # NOTE(review): on Python 2, str() of a BeautifulSoup tag is
        # presumably a UTF-8 byte string; combining it with the unicode
        # labels in print_clean() may fail for non-ASCII text - confirm.
        if string is None:
            return ''
        without_tags = self._TAG_RE.sub('', str(string))
        return self.unescape(without_tags).strip()

    def extract_data(self):
        """Append the cleaned sections of ``self.useful_data`` to the results.

        The new dict holds one plain-text value per section plus ``'url'``,
        the ``href`` of the first anchor on the page ('' when there is none).
        """
        page = self.useful_data
        entry = {}
        for key, element_id in self._SECTIONS:
            entry[key] = self.remove_html_tags(
                page.find(attrs={"id": element_id}))

        link = page.find('a')
        entry['url'] = (link.get('href') or '') if link is not None else ''

        self.data.append(entry)

    def print_html(self):
        """Build the HTML output in ``self.output`` and print it.

        Name, grammar and URL are emitted whenever their flag is set; the
        headed sections are additionally skipped when they are empty.
        """
        parts = []

        for definition in self.data:
            if 'n' in self.options:
                parts.append("<h2>" + definition['name'] + "</h2>")
            if 'g' in self.options:
                parts.append(definition['grammar'])
            for flags, heading, key in self._HTML_SECTIONS:
                if self._wants(flags) and definition[key] != '':
                    parts.append("<br /><br /><b>" + heading + "</b><hr />"
                                 + definition[key])
            if 'u' in self.options:
                parts.append("<br /><br /><b>URL</b><hr />"
                             + definition['url'])

        self.output = ''.join(parts)
        print(self.output)

    def print_clean(self):
        """Build the plain-text (CLI) output in ``self.output`` and print it.

        Every selected section is printed on its own labelled line, empty or
        not, and each definition is followed by a blank line. The selection
        comes from ``self.options``.
        """
        lines = []

        for definition in self.data:
            for flags, label, key in self._CLEAN_SECTIONS:
                if self._wants(flags):
                    lines.append(label + definition[key] + "\n")
            lines.append("\n")

        self.output = ''.join(lines)
        print(self.output)


if __name__ == "__main__":

    # Show the usage text when no word is given or when the first argument
    # looks like a switch (e.g. -h). Only a leading '-' counts, so words that
    # merely contain a hyphen can still be looked up.
    if len(sys.argv) < 2 or sys.argv[1].startswith('-'):
        sys.exit('Usage: %s word [options]\n'
                 'Options return only specific information about word:\n'
                 '\t-n\tword (with accents)\n\t-u\tURL on HJP\n'
                 '\t-g\tgrammar\n\t-d\tdefinition\n'
                 '\t-s\tsyntagm\n\t-p\tphraseology\n'
                 '\t-o\tonomastics\n\t-e\tetymology\n'
                 '\tMultiple combinations are allowed too;\n'
                 '\twithout any, program returns all information' % sys.argv[0])

    query = sys.argv[1]

    # Everything after the word is concatenated into one flag string; the
    # '-' characters are harmless because flags are matched per character.
    # Without any option arguments all sections are requested.
    if len(sys.argv) > 2:
        options = ''.join(sys.argv[2:])
    else:
        options = 'nugdspoe'

    word = Word()
    word.set_options(options)
    word.search(query)
    word.print_clean()