Razlika između inačica stranice Suradnik:4ndY/hjp.py

Izvor: HrOpenWiki
Skoči na: orijentacija, traži
m
m
 
Nije prikazano 5 međuinačica istog suradnika
Redak 1: Redak 1:
 +
'''Ova skripta se više ne održava na ovoj lokaciji, već se aktualna verzija može naći na [https://gitorious.org/hjp-plasmoid/hjp-plasmoid/trees/master ovoj] adresi.'''
 +
 
<pre>
 
<pre>
 
#!/usr/bin/python
 
#!/usr/bin/python
 +
# -*- coding: utf-8 -*-
 +
 +
###################################
 
# author: Andrej Dundovic
 
# author: Andrej Dundovic
# e-mail: andrej@dundovic.com.hr
+
# date: 5. 2012.
# date: 04. 2011.
+
# contact: andrej AT dundovic DOT com.hr
 +
# licence: GPLv3
 +
# description: script for croatian
 +
# dictionary (HJP) look-up
 +
###################################
  
import urllib
+
import requests
import urllib2
+
import sys
+
 
from BeautifulSoup import BeautifulSoup
 
from BeautifulSoup import BeautifulSoup
 
import re, htmlentitydefs
 
import re, htmlentitydefs
 +
import sys
  
if len(sys.argv) < 2 or '-' in sys.argv[1]:
+
class Word:
     sys.exit('Usage: %s word [options]\n'
+
     """ class for fetching words from hjp.novi-liber.hr """
    'Options return only specific information about word:\n'
+
    '\t-n\tword (with accents)\n\t-u\tURL on HJP\n'
+
    '\t-g\tgrammar\n\t-d\tdefinition\n'
+
    '\t-s\tsyntagm\n\t-p\tphraseology\n'
+
    '\t-o\tonomastics\n\t-e\tetymology\n'
+
    '\tMultiple combinations are allowed too;\n'
+
    '\twithout any, program returns all information' % sys.argv[0])
+
else:
+
    query = sys.argv[1]
+
    options = 'nugdspoe'
+
  
if len(sys.argv) > 2:
+
    def __init__( self ):
    options = ''.join( sys.argv[2:] )
+
""" init method, define some variables """
 +
 +
self.options = 'nugdspoe' # default options (all)
 +
self.useful_data = ''
 +
self.output = ''
 +
self.data = []
 +
self.target_url = ''
 +
self.service_url = 'http://hjp.novi-liber.hr/'
 +
self.autocompl_url = 'http://hjp.novi-liber.hr/hjp_ajax.php'
 +
self.target_word = '';
 +
self.words = []
 +
  
service_url = 'http://hjp.srce.hr/index.php?show=search'
+
    def set_options( self, options ):
values = { 'word' : query }
+
""" set which parts of definition will be displayed """
data = urllib.urlencode( values )
+
 +
self.options = options
  
def get_page( service_url, data ):
+
     req = urllib2.Request( service_url, data )
+
    def get_options( self ):
    response = urllib2.urlopen( req )
+
""" get which parts of definition will be displayed """
    return response.read()
+
 +
return self.options
 +
 +
 +
    def search( self, word ):
 +
""" search for word - THE method """
 +
 +
# look for a word
 +
self.target_word = word
 +
 +
# get list of possible word autocompletion
 +
self.get_autocomplete()
 +
 +
# print autocompletion list
 +
if len( self.words ) > 1:
 +
    for word in self.words:
 +
print word
 +
    print
 +
 +
# take the first one from the list
 +
self.target_word = self.words[0]
 +
self.target_url = self.service_url + 'index.php?show=search'
 +
self.useful_data = BeautifulSoup( self.get_content( { 'word': self.target_word } ) )
 +
 +
# test is the word homonym
 +
if self.useful_data.find( attrs={ "id" : "natuknica_raster_frame" } ) is None:
 +
    self.multiple_definition()
 +
 +
else:
 +
    self.extract_data()
 +
      
 +
    def get_content( self, post_data ):
 +
""" get page and parse useful content """
 +
 +
req = requests.post( self.target_url, data = post_data )
 +
 +
soup = BeautifulSoup( req.text )
 +
 +
return str( soup.find( attrs={ "class" : "natuknica" } ) )
  
def unescape(text):
+
   
     def fixup(m):
+
     def get_autocomplete( self ):
        text = m.group(0)
+
""" get list of possible word autocompletion """
        if text[:2] == "&#":
+
            # character reference
+
            try:
+
                if text[:3] == "&#x":
+
                    return unichr(int(text[3:-1], 16))
+
                else:
+
                    return unichr(int(text[2:-1]))
+
            except ValueError:
+
                pass
+
        else:
+
            # named entity
+
            try:
+
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
+
            except KeyError:
+
                pass
+
        return text # leave as is
+
    return re.sub("&#?\w+;", fixup, text)
+
  
def remove_html_tags(data):
+
        get_data = { 'q': self.target_word, 'limit': 10, 's': 's' }
    p = re.compile(r'<.*?>')
+
        req = requests.get( self.autocompl_url, params = get_data, timeout = 1 )
    return p.sub('', unescape( str( data ).strip() ) )
+
  
class Word:
+
        self.words = req.text.split('\n')[0:-1]
     useful_data = ''
+
      
     data = dict()
+
   
 +
    def multiple_definition( self ):
 +
""" word is homonym so get all homonyms """
 +
 
 +
        for choice in self.useful_data.findAll('a'):
 +
            self.target_url = self.service_url + choice.get('href')
 +
            self.useful_data = BeautifulSoup( self.get_content( '' ) )
 +
            self.extract_data()
 +
      
 +
   
 +
    def unescape( self, text ):
 +
""" fix for HTML characters """
 +
 
 +
def fixup(m):
 +
    text = m.group(0)
 +
    if text[:2] == "&#":
 +
# character reference
 +
try:
 +
    if text[:3] == "&#x":
 +
return unichr(int(text[3:-1], 16))
 +
    else:
 +
return unichr(int(text[2:-1]))
 +
except ValueError:
 +
    pass
 +
    else:
 +
# named entity
 +
try:
 +
    text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
 +
except KeyError:
 +
    pass
 +
    return text # leave as is
 +
return re.sub("&#?\w+;", fixup, text)
 +
 
 +
   
 +
    def remove_html_tags( self, string ):
 +
""" removes HTML tags and convert HTML escaped chars in single character equivalent """
 +
 +
p = re.compile(r'<.*?>')
 +
return p.sub('',  self.unescape( str( string ).strip() ) ).strip()
 +
   
 
      
 
      
 
     def extract_data( self ):
 
     def extract_data( self ):
# word
+
""" extract clean data from HTML """
self.data['name'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "natuknica_raster_frame" } ) )
+
# word URL
+
self.data.append( dict() )
self.data['word_url'] = self.useful_data.find('a')['href']
+
 
# grammar info
+
self.data[-1]['name'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "natuknica_raster_frame" } ) ) # word
self.data['grammar'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "osnovni_podaci_frame" } ) )
+
self.data[-1]['url'] = self.useful_data.find('a')['href'] # word URL
# definition
+
self.data[-1]['grammar'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "osnovni_podaci_frame" } ) ) # grammar info
self.data['definition'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "definicija_frame" } ) )
+
self.data[-1]['definition'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "definicija_frame" } ) ) # definition
# sintagm
+
self.data[-1]['syntagm'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "sintagma_frame" } ) ) # sintagm
self.data['syntagm'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "sintagma_frame" } ) )
+
self.data[-1]['phraseology'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "frazeologija_frame" } ) ) # phraseology
# phraseology
+
self.data[-1]['onomastics'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "onomastika_frame" } ) ) # onomastics
self.data['phraseology'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "frazeologija_frame" } ) )
+
self.data[-1]['etymology'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "etimologija_frame" } ) ) # etymology
# onomastics
+
self.data['onomastics'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "onomastika_frame" } ) )
+
# etymology
+
    def print_html( self ):
self.data['etymology'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "etimologija_frame" } ) )
+
""" print HTML output """
 +
 +
self.output = ''
 +
 +
for definition in self.data:
 +
   
 +
    if 'n' in self.options:
 +
self.output += "<h2>" + definition['name'] + "</h2>"
 +
    if 'g' in self.options:
 +
self.output += definition['grammar']
 +
    if 'd' in self.options and definition['definition'] <> '':
 +
self.output += "<br /><br /><b>Definicija</b><hr />" + definition['definition']
 +
    if 's' in self.options and definition['syntagm'] <> '':
 +
self.output += "<br /><br /><b>Sintagma</b><hr />" + definition['syntagm']
 +
    if 'f' in self.options and definition['phraseology'] <> '':
 +
self.output += "<br /><br /><b>Frazeologija</b><hr />" + definition['phraseology']
 +
    if 'o' in self.options and definition['onomastics'] <> '':
 +
self.output += "<br /><br /><b>Onomastika</b><hr />" + definition['onomastics']
 +
    if 'e' in self.options and definition['etymology'] <> '':
 +
self.output += "<br /><br /><b>Etimologija</b><hr />" + definition['etymology']
 +
    if 'u' in self.options:
 +
self.output += "<br /><br /><b>URL</b><hr />" + definition['url']
 
    
 
    
     def __init__( self, raw_data ):
+
print self.output
self.useful_data = BeautifulSoup( raw_data )
+
self.extract_data()
+
     def print_clean( self ):
 
+
""" print clean outpu (CLI) """
    def print_all( self ):
+
if 'n' in options:
+
self.output = ''
    print u"> Rijec: ", self.data['name']
+
     
if 'u' in options:
+
for definition in self.data:
    print u"> URL: ", self.data['word_url']
+
 
if 'g' in options:
+
    if 'n' in options:
    print u"> Gramatika: ", self.data['grammar']
+
self.output += u"> Rijec: " + definition['name'] + "\n"
if 'd' in options:
+
    if 'u' in options:
    print u"> Definicije: ", self.data['definition']
+
self.output += u"> URL: " + definition['url'] + "\n"
if 's' in options:
+
    if 'g' in options:
    print u"> Sintagma: ", self.data['syntagm']
+
self.output += u"> Gramatika: " + definition['grammar'] + "\n"
if 'f' in options:
+
    if 'd' in options:
    print u"> Frazeologija: ", self.data['phraseology']
+
self.output += u"> Definicije: " + definition['definition'] + "\n"
if 'o' in options:
+
    if 's' in options:
    print u"> Onomastika: ", self.data['onomastics']
+
self.output += u"> Sintagma: " + definition['syntagm'] + "\n"
if 'e' in options:
+
    if 'f' in options:
    print u"> Etimologija: ", self.data['etymology']
+
self.output += u"> Frazeologija: " + definition['phraseology'] + "\n"
 +
    if 'o' in options:
 +
self.output += u"> Onomastika: " + definition['onomastics'] + "\n"
 +
    if 'e' in options:
 +
self.output += u"> Etimologija: " + definition['etymology'] + "\n"
 +
    self.output += "\n"
 
 
 +
print self.output
  
soup = BeautifulSoup( get_page( service_url, data ) )
 
subpage = str( soup.find( attrs={ "class" : "natuknica" } ) )
 
  
if ( subpage == 'None' ):
+
if __name__ == "__main__":
    print "Word not found"
+
 
else:
+
     if len(sys.argv) < 2 or '-' in sys.argv[1]:
    word = Word( subpage )
+
  """ help """
     if ( word.data['name'] == 'None' ):
+
  sys.exit('Usage: %s word [options]\n'
""" multiple choice """
+
  'Options return only specific information about word:\n'
word_url = word.useful_data.findAll('td')
+
  '\t-n\tword (with accents)\n\t-u\tURL on HJP\n'
for choice in word_url[2:]:
+
  '\t-g\tgrammar\n\t-d\tdefinition\n'
    link = BeautifulSoup( str( choice ) )
+
  '\t-s\tsyntagm\n\t-p\tphraseology\n'
    url = "http://hjp.srce.hr/"+link.find('a')['href']
+
  '\t-o\tonomastics\n\t-e\tetymology\n'
    soup_tmp = BeautifulSoup( get_page( url, '' ) )
+
  '\tMultiple combinations are allowed too;\n'
    word = Word( str( soup_tmp.find( attrs={ "class" : "natuknica" } ) ) )
+
  '\twithout any, program returns all information' % sys.argv[0])
    word.print_all()
+
 
     else:
 
     else:
""" single choice """
+
  query = sys.argv[1]
word.print_all()
+
  options = 'nugdspoe'
 +
 
 +
    if len(sys.argv) > 2:
 +
  options = ''.join( sys.argv[2:] )
 +
   
 +
    word = Word()
 +
    word.set_options( options )
 +
    word.search( query )
 +
    word.print_clean()
 +
 
 
</pre>
 
</pre>

Trenutačna izmjena od 00:06, 12. studenoga 2012.

Ova skripta se više ne održava na ovoj lokaciji, već se aktualna verzija može naći na adresi https://gitorious.org/hjp-plasmoid/hjp-plasmoid/trees/master.

#!/usr/bin/python
# -*- coding: utf-8 -*-

###################################
# author: Andrej Dundovic
# date: 5. 2012.
# contact: andrej AT dundovic DOT com.hr
# licence: GPLv3
# description: script for croatian
# dictionary (HJP) look-up
###################################

import requests 
from BeautifulSoup import BeautifulSoup
import re, htmlentitydefs
import sys

class Word:
    """Fetch and print dictionary entries from hjp.novi-liber.hr.

    Typical use: create an instance, optionally call ``set_options`` with
    the letters of the sections to show, call ``search(word)`` and then
    ``print_clean()`` or ``print_html()``.

    Option letters: ``n`` name, ``u`` URL, ``g`` grammar, ``d`` definition,
    ``s`` syntagm, ``p`` phraseology (``f`` is accepted as an alias),
    ``o`` onomastics, ``e`` etymology.  Any other characters in the
    options string (for example ``-``) are ignored.

    The class relies on the module-level imports ``requests``,
    ``BeautifulSoup``, ``re`` and ``htmlentitydefs``.
    """

    # Matches a single HTML tag that does not span a line break.
    _TAG_RE = re.compile(r'<.*?>')

    # Entry key -> id of the HTML element holding that part of an entry.
    _FRAMES = (
        ('name', 'natuknica_raster_frame'),
        ('grammar', 'osnovni_podaci_frame'),
        ('definition', 'definicija_frame'),
        ('syntagm', 'sintagma_frame'),
        ('phraseology', 'frazeologija_frame'),
        ('onomastics', 'onomastika_frame'),
        ('etymology', 'etimologija_frame'),
    )

    # (option letters, entry key, label) in the order print_clean emits them.
    # Phraseology answers to the documented 'p' and to the legacy 'f'.
    _CLEAN_FIELDS = (
        ('n', 'name', u"> Rijec: "),
        ('u', 'url', u"> URL: "),
        ('g', 'grammar', u"> Gramatika: "),
        ('d', 'definition', u"> Definicije: "),
        ('s', 'syntagm', u"> Sintagma: "),
        ('pf', 'phraseology', u"> Frazeologija: "),
        ('o', 'onomastics', u"> Onomastika: "),
        ('e', 'etymology', u"> Etimologija: "),
    )

    # (option letters, entry key, heading) for the titled print_html sections.
    _HTML_SECTIONS = (
        ('d', 'definition', "Definicija"),
        ('s', 'syntagm', "Sintagma"),
        ('pf', 'phraseology', "Frazeologija"),
        ('o', 'onomastics', "Onomastika"),
        ('e', 'etymology', "Etimologija"),
    )

    def __init__(self):
        """Set default options and empty state; performs no network access."""
        self.options = 'nugdspoe'  # default options (all sections)
        self.useful_data = ''      # parsed HTML of the entry being processed
        self.output = ''           # text produced by the last print_* call
        self.data = []             # one dict per extracted definition
        self.target_url = ''
        self.service_url = 'http://hjp.novi-liber.hr/'
        self.autocompl_url = 'http://hjp.novi-liber.hr/hjp_ajax.php'
        self.target_word = ''
        self.words = []            # autocompletion suggestions

    def set_options(self, options):
        """Set which parts of a definition will be displayed."""
        self.options = options

    def get_options(self):
        """Return the string of option letters currently in effect."""
        return self.options

    def _wants(self, flags):
        """Return True if any letter in *flags* is among the active options."""
        return any(flag in self.options for flag in flags)

    def search(self, word):
        """Look up *word* and append every definition found to ``self.data``.

        The autocompletion list is fetched first; when it offers more than
        one candidate the candidates are printed, and the first candidate is
        the word actually looked up.  If the list is empty the word is
        looked up exactly as given.
        """
        self.target_word = word

        # get list of possible word autocompletions
        self.get_autocomplete()

        # print autocompletion list
        if len(self.words) > 1:
            for suggestion in self.words:
                print(suggestion)
            print('')

        # take the first suggestion; with no suggestions keep the typed word
        # instead of failing on an empty list
        if self.words:
            self.target_word = self.words[0]
        self.target_url = self.service_url + 'index.php?show=search'
        self.useful_data = BeautifulSoup(
            self.get_content({'word': self.target_word}))

        # a page without the headword frame is treated as a list of homonyms
        if self.useful_data.find(attrs={"id": "natuknica_raster_frame"}) is None:
            self.multiple_definition()
        else:
            self.extract_data()

    def get_content(self, post_data):
        """POST *post_data* to ``self.target_url`` and return the entry HTML.

        Returns the string form of the first element with class
        ``natuknica``; this is the string ``'None'`` when no such element
        exists in the response.
        """
        req = requests.post(self.target_url, data=post_data)
        soup = BeautifulSoup(req.text)
        return str(soup.find(attrs={"class": "natuknica"}))

    def get_autocomplete(self):
        """Fill ``self.words`` with autocompletion suggestions for the word.

        The response is read as one suggestion per line.  Blank lines
        (including the one produced by a trailing newline) are dropped, so
        the last suggestion is kept even when no trailing newline is sent.
        """
        get_data = {'q': self.target_word, 'limit': 10, 's': 's'}
        req = requests.get(self.autocompl_url, params=get_data, timeout=1)
        self.words = [line for line in req.text.split('\n') if line.strip()]

    def multiple_definition(self):
        """Fetch and extract every homonym linked from the current page."""
        # Collect the links first: self.useful_data is replaced in the loop.
        links = self.useful_data.findAll('a')
        for choice in links:
            href = choice.get('href')
            if not href:
                continue  # an anchor without a target cannot be followed
            self.target_url = self.service_url + href
            self.useful_data = BeautifulSoup(self.get_content(''))
            self.extract_data()

    def unescape(self, text):
        """Replace HTML character references and named entities in *text*.

        References that cannot be resolved are left unchanged.
        """
        def fixup(m):
            text = m.group(0)
            if text[:2] == "&#":
                # numeric character reference, hexadecimal or decimal
                try:
                    if text[:3] == "&#x":
                        return unichr(int(text[3:-1], 16))
                    else:
                        return unichr(int(text[2:-1]))
                except ValueError:
                    pass
            else:
                # named entity
                try:
                    text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
                except KeyError:
                    pass
            return text  # leave as is
        return re.sub(r"&#?\w+;", fixup, text)

    def remove_html_tags(self, string):
        """Return *string* with HTML tags removed and entities unescaped.

        *string* may be any object; it is converted with ``str``.  ``None``
        (a missing page element) yields ``''`` rather than the text
        ``'None'``, so callers can test for an empty section.
        """
        if string is None:
            return ''
        return self._TAG_RE.sub('', self.unescape(str(string).strip())).strip()

    def extract_data(self):
        """Append a dict with the cleaned parts of the current entry.

        The dict has the keys ``name``, ``url``, ``grammar``,
        ``definition``, ``syntagm``, ``phraseology``, ``onomastics`` and
        ``etymology``; parts missing from the page are empty strings.
        """
        entry = dict()
        for key, frame_id in self._FRAMES:
            entry[key] = self.remove_html_tags(
                self.useful_data.find(attrs={"id": frame_id}))

        # word URL: href of the first link in the entry, if there is one
        link = self.useful_data.find('a')
        entry['url'] = link.get('href', '') if link is not None else ''

        self.data.append(entry)

    def print_html(self):
        """Build the HTML rendering in ``self.output`` and print it.

        Titled sections are emitted only when selected and non-empty.
        """
        parts = []
        for definition in self.data:
            if 'n' in self.options:
                parts.append("<h2>" + definition['name'] + "</h2>")
            if 'g' in self.options:
                parts.append(definition['grammar'])
            for flags, key, title in self._HTML_SECTIONS:
                if self._wants(flags) and definition[key] != '':
                    parts.append("<br /><br /><b>" + title + "</b><hr />"
                                 + definition[key])
            if 'u' in self.options:
                parts.append("<br /><br /><b>URL</b><hr />" + definition['url'])

        self.output = ''.join(parts)
        print(self.output)

    def print_clean(self):
        """Build the plain-text (CLI) rendering in ``self.output`` and print it.

        Every selected section gets its own labelled line; each definition
        is followed by a blank line.
        """
        parts = []
        for definition in self.data:
            for flags, key, label in self._CLEAN_FIELDS:
                # use the instance options, not a module-level global
                if self._wants(flags):
                    parts.append(label + definition[key] + "\n")
            parts.append("\n")

        self.output = ''.join(parts)
        print(self.output)


if __name__ == "__main__":

    # Show the usage text and exit when no word is given, or when the first
    # argument contains a '-' (i.e. it looks like an option, not a word).
    if len(sys.argv) < 2 or '-' in sys.argv[1]:
        sys.exit('Usage: %s word [options]\n'
                 'Options return only specific information about word:\n'
                 '\t-n\tword (with accents)\n\t-u\tURL on HJP\n'
                 '\t-g\tgrammar\n\t-d\tdefinition\n'
                 '\t-s\tsyntagm\n\t-p\tphraseology\n'
                 '\t-o\tonomastics\n\t-e\tetymology\n'
                 '\tMultiple combinations are allowed too;\n'
                 '\twithout any, program returns all information' % sys.argv[0])

    query = sys.argv[1]

    # Everything after the word is glued into one options string; with no
    # extra arguments all sections are requested.
    if len(sys.argv) > 2:
        options = ''.join(sys.argv[2:])
    else:
        options = 'nugdspoe'

    word = Word()
    word.set_options(options)
    word.search(query)
    word.print_clean()