Razlika između inačica stranice Suradnik:4ndY/hjp.py

Izvor: HrOpenWiki
Skoči na: orijentacija, traži
m
m
Redak 5: Redak 5:
 
# -*- coding: utf-8 -*-
 
# -*- coding: utf-8 -*-
  
 +
###################################
 
# author: Andrej Dundovic
 
# author: Andrej Dundovic
# date: 4. 2011.
+
# date: 5. 2012.
 
# contact: andrej@dundovic.com.hr
 
# contact: andrej@dundovic.com.hr
 
# licence: GPLv3
 
# licence: GPLv3
 +
# description: script for croatian
 +
# dictionary (HJP) look-up
 +
###################################
  
import urllib
+
import requests
import urllib2
+
 
from BeautifulSoup import BeautifulSoup
 
from BeautifulSoup import BeautifulSoup
 
import re, htmlentitydefs
 
import re, htmlentitydefs
Redak 17: Redak 20:
  
 
class Word:
 
class Word:
     """ class for fetching words from hjp.srce.hr """
+
     """ class for fetching words from hjp.novi-liber.hr """
 +
 
 +
    def __init__( self ):
 +
""" init method, define some variables """
 +
 +
self.options = 'nugdspoe' # default options (all)
 +
self.useful_data = ''
 +
self.output = ''
 +
self.data = []
 +
self.target_url = ''
 +
self.service_url = 'http://hjp.novi-liber.hr/'
 +
self.autocompl_url = 'http://hjp.novi-liber.hr/hjp_ajax.php'
 +
self.target_word = '';
 +
self.words = []
 +
 +
 
 +
    def set_options( self, options ):
 +
""" set which parts of definition will be displayed """
 +
 +
self.options = options
 +
 
 +
 +
    def get_options( self ):
 +
""" get which parts of definition will be displayed """
 +
 +
return self.options
 +
 +
 +
    def search( self, word ):
 +
""" search for word - THE method """
 +
 +
# look for a word
 +
self.target_word = word
 +
 +
# get list of possible word autocompletion
 +
self.get_autocomplete()
 +
 +
# print autocompletion list
 +
if len( self.words ) > 1:
 +
    for word in self.words:
 +
print word
 +
    print
 +
 +
# take the first one from the list
 +
self.target_word = self.words[0]
 +
self.target_url = self.service_url + 'index.php?show=search'
 +
self.useful_data = BeautifulSoup( self.get_content( { 'word': self.target_word } ) )
 +
 +
# test is the word homonym
 +
if self.useful_data.find( attrs={ "id" : "natuknica_raster_frame" } ) is None:
 +
    self.multiple_definition()
 +
 +
else:
 +
    self.extract_data()
 
      
 
      
    options = 'nugdspoe' # default options (all)
+
     def get_content( self, post_data ):
    useful_data = ''
+
    output = ''
+
    data = dict()
+
    service_url = 'http://hjp.srce.hr/index.php?show=search'
+
    target_word = '';
+
   
+
   
+
     def get_content( self ):
+
 
""" get page and parse useful content """
 
""" get page and parse useful content """
 
 
post_data = urllib.urlencode( { 'word' : self.target_word } )
+
req = requests.post( self.target_url, data = post_data )
req = urllib2.Request( self.service_url, post_data )
+
response = urllib2.urlopen( req )
+
 
 
soup = BeautifulSoup( response )
+
soup = BeautifulSoup( req.text )
 
 
 
return str( soup.find( attrs={ "class" : "natuknica" } ) )
 
return str( soup.find( attrs={ "class" : "natuknica" } ) )
 +
 +
   
 +
    def get_autocomplete( self ):
 +
""" get list of possible word autocompletion """
 +
 +
        get_data = { 'q': self.target_word, 'limit': 10, 's': 's' }
 +
        req = requests.get( self.autocompl_url, params = get_data, timeout = 1 )
 +
 +
        self.words = req.text.split('\n')[0:-1]
 +
   
 +
   
 +
    def multiple_definition( self ):
 +
""" word is homonym so get all homonyms """
 +
 +
        for choice in self.useful_data.findAll('a'):
 +
            self.target_url = self.service_url + choice.get('href')
 +
            self.useful_data = BeautifulSoup( self.get_content( '' ) )
 +
            self.extract_data()
 +
   
 
      
 
      
 
     def unescape( self, text ):
 
     def unescape( self, text ):
 +
""" fix for HTML characters """
 +
 
def fixup(m):
 
def fixup(m):
 
    text = m.group(0)
 
    text = m.group(0)
Redak 58: Redak 125:
 
    return text # leave as is
 
    return text # leave as is
 
return re.sub("&#?\w+;", fixup, text)
 
return re.sub("&#?\w+;", fixup, text)
 +
 
      
 
      
 
     def remove_html_tags( self, string ):
 
     def remove_html_tags( self, string ):
Redak 64: Redak 132:
 
p = re.compile(r'<.*?>')
 
p = re.compile(r'<.*?>')
 
return p.sub('',  self.unescape( str( string ).strip() ) ).strip()
 
return p.sub('',  self.unescape( str( string ).strip() ) ).strip()
 +
   
 
      
 
      
 
     def extract_data( self ):
 
     def extract_data( self ):
 
""" extract clean data from HTML """
 
""" extract clean data from HTML """
 
 
self.data['name'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "natuknica_raster_frame" } ) ) # word
+
self.data.append( dict() )
self.data['url'] = self.useful_data.find('a')['href'] # word URL
+
 
self.data['grammar'] =  self.remove_html_tags( self.useful_data.find( attrs={ "id" : "osnovni_podaci_frame" } ) ) # grammar info
+
self.data[-1]['name'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "natuknica_raster_frame" } ) ) # word
self.data['definition'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "definicija_frame" } ) ) # definition
+
self.data[-1]['url'] = self.useful_data.find('a')['href'] # word URL
self.data['syntagm'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "sintagma_frame" } ) ) # sintagm
+
self.data[-1]['grammar'] =  self.remove_html_tags( self.useful_data.find( attrs={ "id" : "osnovni_podaci_frame" } ) ) # grammar info
self.data['phraseology'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "frazeologija_frame" } ) ) # phraseology
+
self.data[-1]['definition'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "definicija_frame" } ) ) # definition
self.data['onomastics'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "onomastika_frame" } ) ) # onomastics
+
self.data[-1]['syntagm'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "sintagma_frame" } ) ) # sintagm
self.data['etymology'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "etimologija_frame" } ) ) # etymology
+
self.data[-1]['phraseology'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "frazeologija_frame" } ) ) # phraseology
 
+
self.data[-1]['onomastics'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "onomastika_frame" } ) ) # onomastics
    def __init__( self, word ):
+
self.data[-1]['etymology'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "etimologija_frame" } ) ) # etymology
     
+
self.target_word = word
+
 
 
subpage = self.get_content()
 
 
 
if ( subpage == 'None' ):
+
    def print_html( self ):
    self.output = "Rijec nije nadena"
+
""" print HTML output """
else:
+
    self.useful_data = BeautifulSoup( subpage )
+
self.output = ''
    self.extract_data()
+
    if ( self.data['name'] == 'None' ):
+
for definition in self.data:
""" multiple choice """
+
word_url = self.useful_data.findAll('td')
+
for choice in word_url[2:]:
+
    link = BeautifulSoup( str( choice ) )
+
    self.service_url = "http://hjp.srce.hr/"+link.find('a')['href']
+
    self.target_word = ''
+
    subpage = self.get_content()
+
    self.useful_data = BeautifulSoup( subpage )
+
    self.extract_data()
+
    self.results_formatting( 1 )
+
    else:
+
""" single choice """
+
self.results_formatting( 0 )
+
 
+
    def results_formatting( self, append ):
+
if append <> 1:
+
    self.output = ''
+
else:
+
    self.output += ''
+
 
     
 
     
if 'n' in self.options:
+
    if 'n' in self.options:
    self.output += "<h2>" + self.data['name'] + "</h2>"
+
self.output += "<h2>" + definition['name'] + "</h2>"
if 'g' in self.options:
+
    if 'g' in self.options:
    self.output += self.data['grammar']
+
self.output += definition['grammar']
if 'd' in self.options and self.data['definition'] <> '':
+
    if 'd' in self.options and definition['definition'] <> '':
    self.output += "<br /><br /><b>Definicija</b><hr />" + self.data['definition']
+
self.output += "<br /><br /><b>Definicija</b><hr />" + definition['definition']
if 's' in self.options and self.data['syntagm'] <> '':
+
    if 's' in self.options and definition['syntagm'] <> '':
    self.output += "<br /><br /><b>Sintagma</b><hr />" + self.data['syntagm']
+
self.output += "<br /><br /><b>Sintagma</b><hr />" + definition['syntagm']
if 'f' in self.options and self.data['phraseology'] <> '':
+
    if 'f' in self.options and definition['phraseology'] <> '':
    self.output += "<br /><br /><b>Frazeologija</b><hr />" + self.data['phraseology']
+
self.output += "<br /><br /><b>Frazeologija</b><hr />" + definition['phraseology']
if 'o' in self.options and self.data['onomastics'] <> '':
+
    if 'o' in self.options and definition['onomastics'] <> '':
    self.output += "<br /><br /><b>Onomastika</b><hr />" + self.data['onomastics']
+
self.output += "<br /><br /><b>Onomastika</b><hr />" + definition['onomastics']
if 'e' in self.options and self.data['etymology'] <> '':
+
    if 'e' in self.options and definition['etymology'] <> '':
    self.output += "<br /><br /><b>Etimologija</b><hr />" + self.data['etymology']
+
self.output += "<br /><br /><b>Etimologija</b><hr />" + definition['etymology']
if 'u' in self.options:
+
    if 'u' in self.options:
    self.output += "<br /><br /><b>URL</b><hr />" + self.data['url']
+
self.output += "<br /><br /><b>URL</b><hr />" + definition['url']
 
+
 
     def results( self ):
+
print self.output
return self.output
+
 
+
     def print_clean( self ):
    def print_all( self ):
+
""" print clean outpu (CLI) """
if 'n' in options:
+
    print u"> Rijec: ", self.data['name']
+
self.output = ''
if 'u' in options:
+
     
    print u"> URL: ", self.data['url']
+
for definition in self.data:
if 'g' in options:
+
 
    print u"> Gramatika: ", self.data['grammar']
+
    if 'n' in options:
if 'd' in options:
+
self.output += u"> Rijec: " + definition['name'] + "\n"
    print u"> Definicije: ", self.data['definition']
+
    if 'u' in options:
if 's' in options:
+
self.output += u"> URL: " + definition['url'] + "\n"
    print u"> Sintagma: ", self.data['syntagm']
+
    if 'g' in options:
if 'f' in options:
+
self.output += u"> Gramatika: " + definition['grammar'] + "\n"
    print u"> Frazeologija: ", self.data['phraseology']
+
    if 'd' in options:
if 'o' in options:
+
self.output += u"> Definicije: " + definition['definition'] + "\n"
    print u"> Onomastika: ", self.data['onomastics']
+
    if 's' in options:
if 'e' in options:
+
self.output += u"> Sintagma: " + definition['syntagm'] + "\n"
    print u"> Etimologija: ", self.data['etymology']
+
    if 'f' in options:
 +
self.output += u"> Frazeologija: " + definition['phraseology'] + "\n"
 +
    if 'o' in options:
 +
self.output += u"> Onomastika: " + definition['onomastics'] + "\n"
 +
    if 'e' in options:
 +
self.output += u"> Etimologija: " + definition['etymology'] + "\n"
 +
    self.output += "\n"
 
 
 +
print self.output
 +
  
# main
 
 
if __name__ == "__main__":
 
if __name__ == "__main__":
 +
 
     if len(sys.argv) < 2 or '-' in sys.argv[1]:
 
     if len(sys.argv) < 2 or '-' in sys.argv[1]:
 +
  """ help """
 
  sys.exit('Usage: %s word [options]\n'
 
  sys.exit('Usage: %s word [options]\n'
 
  'Options return only specific information about word:\n'
 
  'Options return only specific information about word:\n'
Redak 166: Redak 222:
 
  options = ''.join( sys.argv[2:] )
 
  options = ''.join( sys.argv[2:] )
 
      
 
      
     word = Word( query )
+
     word = Word()
     word.options = options
+
     word.set_options( options )
     word.print_all()
+
     word.search( query )
 +
    word.print_clean()
 +
 
 
</pre>
 
</pre>

Inačica od 14:10, 26. svibnja 2012.

Ova skripta se više ne održava na ovoj lokaciji, već se aktualna verzija može naći na ovoj adresi.

#!/usr/bin/python
# -*- coding: utf-8 -*-

###################################
# author: Andrej Dundovic
# date: 5. 2012.
# contact: andrej@dundovic.com.hr
# licence: GPLv3
# description: script for croatian
# dictionary (HJP) look-up
###################################

import requests 
from BeautifulSoup import BeautifulSoup
import re, htmlentitydefs
import sys

class Word:
    """ Look up a word in the HJP dictionary served at hjp.novi-liber.hr.

    Typical use: create an instance, optionally call set_options(), call
    search( word ) and then print_clean() or print_html().

    Attributes:
        options       -- string of one-letter switches selecting what is printed:
                         n name, u URL, g grammar, d definition, s syntagm,
                         p phraseology, o onomastics, e etymology
        data          -- list of dicts, one per extracted dictionary entry
        output        -- text built by the last print_html()/print_clean() call
        useful_data   -- BeautifulSoup tree of the entry currently being processed
        words         -- autocompletion candidates from the last search
        target_word   -- word that is actually looked up
        target_url    -- URL used by the next get_content() call
        service_url   -- base URL of the dictionary
        autocompl_url -- URL of the autocompletion endpoint
    """

    # data key -> id of the HTML element that holds the text for that key
    _FRAMES = (
        ( 'name', 'natuknica_raster_frame' ),
        ( 'grammar', 'osnovni_podaci_frame' ),
        ( 'definition', 'definicija_frame' ),
        ( 'syntagm', 'sintagma_frame' ),
        ( 'phraseology', 'frazeologija_frame' ),
        ( 'onomastics', 'onomastika_frame' ),
        ( 'etymology', 'etimologija_frame' ),
    )

    # optional sections, in print order:
    # ( accepted option letters, data key, HTML heading, CLI label )
    # Phraseology is documented as 'p' (default options, CLI help); the older
    # 'f' letter is still accepted so existing callers keep working.
    _SECTIONS = (
        ( 'd', 'definition', 'Definicija', u"> Definicije: " ),
        ( 's', 'syntagm', 'Sintagma', u"> Sintagma: " ),
        ( 'pf', 'phraseology', 'Frazeologija', u"> Frazeologija: " ),
        ( 'o', 'onomastics', 'Onomastika', u"> Onomastika: " ),
        ( 'e', 'etymology', 'Etimologija', u"> Etimologija: " ),
    )

    def __init__( self ):
        """ Set up default options, service URLs and empty result state. """

        self.options = 'nugdspoe' # default options (all)
        self.useful_data = ''
        self.output = ''
        self.data = []
        self.target_url = ''
        self.service_url = 'http://hjp.novi-liber.hr/'
        self.autocompl_url = 'http://hjp.novi-liber.hr/hjp_ajax.php'
        self.target_word = ''
        self.words = []

    def set_options( self, options ):
        """ Set which parts of a definition will be displayed. """

        self.options = options

    def get_options( self ):
        """ Return the string of option letters currently in effect. """

        return self.options

    def search( self, word ):
        """ Look up `word` and store the extracted entries in self.data.

        The autocompletion service is asked first; when it offers more than
        one candidate they are all printed, and the first candidate is the
        word that is actually looked up.  When it offers nothing, `word`
        itself is looked up instead of failing with an IndexError.

        Results are appended to self.data; earlier results are not cleared.
        """

        self.target_word = word

        # get list of possible word autocompletion
        self.get_autocomplete()

        # print autocompletion list
        if len( self.words ) > 1:
            for candidate in self.words:
                print( candidate )
            print( '' )

        # take the first suggestion, or fall back to the typed word
        if self.words:
            self.target_word = self.words[0]
        else:
            self.target_word = word

        self.target_url = self.service_url + 'index.php?show=search'
        page = self.get_content( { 'word': self.target_word } )
        self.useful_data = BeautifulSoup( page )

        # no headword frame on the result page means the word is a homonym
        # (the page lists links to the individual entries instead)
        headword = self.useful_data.find( attrs={ "id" : "natuknica_raster_frame" } )
        if headword is None:
            self.multiple_definition()
        else:
            self.extract_data()

    def get_content( self, post_data ):
        """ POST `post_data` to self.target_url and return the entry markup.

        Returns the string form of the first element with class "natuknica";
        when the page has no such element the result is the string 'None'.
        """

        req = requests.post( self.target_url, data = post_data )
        soup = BeautifulSoup( req.text )

        return str( soup.find( attrs={ "class" : "natuknica" } ) )

    def get_autocomplete( self ):
        """ Fill self.words with autocompletion candidates for self.target_word. """

        get_data = { 'q': self.target_word, 'limit': 10, 's': 's' }
        req = requests.get( self.autocompl_url, params = get_data, timeout = 1 )

        # one candidate per line; the last piece of the split is dropped
        # (presumably the empty string after a trailing newline -- TODO confirm
        # against the service's actual reply format)
        self.words = req.text.split('\n')[0:-1]

    def multiple_definition( self ):
        """ The word is a homonym: fetch and extract every linked entry. """

        # collect the links first; self.useful_data is replaced inside the loop
        links = self.useful_data.findAll('a')
        for choice in links:
            self.target_url = self.service_url + choice.get('href')
            self.useful_data = BeautifulSoup( self.get_content( '' ) )
            self.extract_data()

    def unescape( self, text ):
        """ Replace HTML character references and named entities in `text`.

        References that cannot be decoded are left unchanged.
        """

        def fixup( m ):
            text = m.group(0)
            if text[:2] == "&#":
                # character reference
                try:
                    if text[:3] == "&#x":
                        return unichr( int( text[3:-1], 16 ) )
                    else:
                        return unichr( int( text[2:-1] ) )
                except ValueError:
                    pass
            else:
                # named entity
                try:
                    text = unichr( htmlentitydefs.name2codepoint[text[1:-1]] )
                except KeyError:
                    pass
            return text # leave as is

        return re.sub( r"&#?\w+;", fixup, text )

    def remove_html_tags( self, string ):
        """ Return `string` as plain text: tags removed, HTML escapes decoded.

        `string` may be any object with a string form (e.g. a parsed tag).
        None -- the result of a failed find() -- yields '' rather than the
        text 'None', so callers can test for an empty section.
        """

        if string is None:
            return ''

        # NOTE(review): str() of a parsed tag may be a byte string; mixing it
        # with the unicode characters produced by unescape() could fail for
        # non-ASCII markup -- confirm against real pages.
        p = re.compile( r'<.*?>' )
        return p.sub( '', self.unescape( str( string ).strip() ) ).strip()

    def extract_data( self ):
        """ Append a dict with the clean text of self.useful_data to self.data.

        Keys: name, url, grammar, definition, syntagm, phraseology,
        onomastics, etymology.  A part missing from the page is stored as ''.
        """

        entry = dict()
        for key, frame_id in self._FRAMES:
            frame = self.useful_data.find( attrs={ "id" : frame_id } )
            entry[key] = self.remove_html_tags( frame )

        # word URL: href of the first link, '' when the entry has no link
        link = self.useful_data.find('a')
        if link is None:
            entry['url'] = ''
        else:
            entry['url'] = link['href']

        self.data.append( entry )

    def _wanted( self, flags ):
        """ Return True when any letter of `flags` is among self.options. """

        return any( flag in self.options for flag in flags )

    def print_html( self ):
        """ Build the HTML form of all entries in self.output and print it.

        Name, grammar and URL are emitted whenever selected; the remaining
        sections only when selected and non-empty.
        """

        parts = []

        for definition in self.data:

            if 'n' in self.options:
                parts.append( "<h2>" + definition['name'] + "</h2>" )
            if 'g' in self.options:
                parts.append( definition['grammar'] )
            for flags, key, heading, _label in self._SECTIONS:
                if self._wanted( flags ) and definition[key] != '':
                    parts.append( "<br /><br /><b>" + heading + "</b><hr />" + definition[key] )
            if 'u' in self.options:
                parts.append( "<br /><br /><b>URL</b><hr />" + definition['url'] )

        self.output = ''.join( parts )
        print( self.output )

    def print_clean( self ):
        """ Build the plain-text (CLI) form of all entries and print it.

        Uses the instance's own options (not a module-level variable), so it
        also works when the class is imported.  Name, URL and grammar are
        emitted whenever selected; the remaining sections only when selected
        and non-empty.  Each entry is followed by a blank line.
        """

        parts = []

        for definition in self.data:

            if 'n' in self.options:
                parts.append( u"> Rijec: " + definition['name'] + "\n" )
            if 'u' in self.options:
                parts.append( u"> URL: " + definition['url'] + "\n" )
            if 'g' in self.options:
                parts.append( u"> Gramatika: " + definition['grammar'] + "\n" )
            for flags, key, _heading, label in self._SECTIONS:
                if self._wanted( flags ) and definition[key] != '':
                    parts.append( label + definition[key] + "\n" )
            parts.append( "\n" )

        self.output = ''.join( parts )
        print( self.output )


if __name__ == "__main__":

    # Show the help text when no word is given or when the first argument is
    # an option switch.  Only a leading '-' counts, so a hyphenated word can
    # still be looked up.
    if len( sys.argv ) < 2 or sys.argv[1].startswith( '-' ):
        sys.exit('Usage: %s word [options]\n'
                 'Options return only specific information about word:\n'
                 '\t-n\tword (with accents)\n\t-u\tURL on HJP\n'
                 '\t-g\tgrammar\n\t-d\tdefinition\n'
                 '\t-s\tsyntagm\n\t-p\tphraseology\n'
                 '\t-o\tonomastics\n\t-e\tetymology\n'
                 '\tMultiple combinations are allowed too;\n'
                 '\twithout any, program returns all information' % sys.argv[0])

    query = sys.argv[1]

    # Everything after the word is treated as option switches; they are glued
    # into one string because options are tested letter by letter (the '-'
    # characters are simply never matched).  No switches means "show all".
    if len( sys.argv ) > 2:
        options = ''.join( sys.argv[2:] )
    else:
        options = 'nugdspoe'

    word = Word()
    word.set_options( options )
    word.search( query )
    word.print_clean()