Suradnik:4ndY/hjp.py

Izvor: HrOpenWiki
Inačica od 09:01, 4. travnja 2011. koju je unio/unijela 4ndY (razgovor | doprinosi)

Skoči na: orijentacija, traži
#!/usr/bin/python
# author: Andrej Dundovic
# e-mail: andrej@dundovic.com.hr
# date: 04. 2011.

import urllib
import urllib2
import sys
from BeautifulSoup import BeautifulSoup
import re, htmlentitydefs

# Command line: argv[1] is the word to look up, every further argument is
# taken as a group of option letters.
#
# Only a *leading* dash marks the first argument as an option given in the
# word's place; testing for a dash anywhere in it would also turn away
# legitimate queries that merely contain a hyphen.
if len(sys.argv) < 2 or sys.argv[1].startswith('-'):
    sys.exit('Usage: %s word [options]\n'
             'Options return only specific information about word:\n'
             '\t-n\tword (with accents)\n\t-u\tURL on HJP\n'
             '\t-g\tgrammar\n\t-d\tdefinition\n'
             '\t-s\tsyntagm\n\t-p\tphraseology\n'
             '\t-o\tonomastics\n\t-e\tetymology\n'
             '\tMultiple combinations are allowed too;\n'
             '\twithout any, program returns all information' % sys.argv[0])

query = sys.argv[1]

# With no option arguments every section is selected; otherwise the option
# arguments are concatenated and later searched letter by letter.
options = 'nugdspoe'
if len(sys.argv) > 2:
    options = ''.join( sys.argv[2:] )

# Search endpoint and the url-encoded form data carrying the query word.
service_url = 'http://hjp.srce.hr/index.php?show=search'
values = { 'word' : query }
data = urllib.urlencode( values )

def get_page( service_url, data ):
    """Fetch *service_url* and return the raw response body.

    service_url -- address to request
    data        -- request payload, handed unchanged to ``urllib2.Request``

    Returns whatever ``response.read()`` yields.  Errors raised by
    ``urllib2.urlopen`` are not caught here and propagate to the caller.
    """
    req = urllib2.Request( service_url, data )
    response = urllib2.urlopen( req )
    try:
        return response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()

def unescape(text):
    """Replace character references and named entities found in *text*.

    Decimal (``&#65;``) and hexadecimal (``&#x41;``) character references
    and entity names listed in ``htmlentitydefs.name2codepoint`` (``&amp;``)
    are replaced by the character they stand for.  Anything that cannot be
    decoded -- an unknown name, malformed digits, or a number ``unichr``
    refuses -- is left in the text exactly as written.

    text -- string that may contain entities
    Returns the string with every decodable entity substituted.
    """
    def fixup(m):
        entity = m.group(0)
        try:
            if entity.startswith("&#x"):
                # hexadecimal character reference
                return unichr(int(entity[3:-1], 16))
            if entity.startswith("&#"):
                # decimal character reference
                return unichr(int(entity[2:-1]))
            # named entity
            return unichr(htmlentitydefs.name2codepoint[entity[1:-1]])
        except (ValueError, OverflowError, KeyError):
            # ValueError: bad digits or code point out of range;
            # OverflowError: number too large for unichr to accept at all;
            # KeyError: unknown entity name.
            return entity # leave as is
    return re.sub(r"&#?\w+;", fixup, text)

def remove_html_tags(data):
    """Return *data* as text with markup removed and entities decoded.

    data -- any object; it is converted with ``str()`` first, so ``None``
            comes back as the literal string ``'None'`` (the script's
            "not found" checks compare against exactly that).

    Surrounding whitespace of the converted string is stripped before the
    tags are removed.
    """
    markup = str( data ).strip()
    # Tags are removed *before* entities are decoded.  Decoding first would
    # turn escaped text such as "&lt;b&gt;" into "<b>", and the tag pattern
    # would then delete content that was never markup.
    text_only = re.sub( r'<.*?>', '', markup )
    return unescape( text_only )

class Word:
    useful_data = ''
    data = dict()
    
    def extract_data( self ):
	# word
	self.data['name'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "natuknica_raster_frame" } ) )
	# word URL
	self.data['word_url'] = self.useful_data.find('a')['href']
	# grammar info
	self.data['grammar'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "osnovni_podaci_frame" } ) )
	# definition
	self.data['definition'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "definicija_frame" } ) )
	# sintagm
	self.data['syntagm'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "sintagma_frame" } ) )
	# phraseology
	self.data['phraseology'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "frazeologija_frame" } ) )
	# onomastics
	self.data['onomastics'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "onomastika_frame" } ) )
	# etymology
	self.data['etymology'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "etimologija_frame" } ) )  
  
    def __init__( self, raw_data ):
	self.useful_data = BeautifulSoup( raw_data )
	self.extract_data()

    def print_all( self ):
	if 'n' in options:
	    print u"> Rijec: ", self.data['name']
	if 'u' in options:
	    print u"> URL: ", self.data['word_url']
	if 'g' in options:
	    print u"> Gramatika: ", self.data['grammar']
	if 'd' in options:
	    print u"> Definicije: ", self.data['definition']
	if 's' in options:
	    print u"> Sintagma: ", self.data['syntagm']
	if 'f' in options:
	    print u"> Frazeologija: ", self.data['phraseology']
	if 'o' in options:
	    print u"> Onomastika: ", self.data['onomastics']
	if 'e' in options:
	    print u"> Etimologija: ", self.data['etymology']
	

soup = BeautifulSoup( get_page( service_url, data ) )
subpage = str( soup.find( attrs={ "class" : "natuknica" } ) )

if ( subpage == 'None' ):
    print "Word not found"
else:
    word = Word( subpage )
    if ( word.data['name'] == 'None' ):
	""" multiple choice """
	word_url = word.useful_data.findAll('td')
	for choice in word_url[2:]:
	    link = BeautifulSoup( str( choice ) )
	    url = "http://hjp.srce.hr/"+link.find('a')['href']
	    soup_tmp = BeautifulSoup( get_page( url, '' ) )
	    word = Word( str( soup_tmp.find( attrs={ "class" : "natuknica" } ) ) )
	    word.print_all()
    else:
	""" single choice """
	word.print_all()