Suradnik:4ndY/hjp.py

Izvor: HrOpenWiki
Inačica od 18:56, 8. travnja 2011. koju je unio/unijela 4ndY (razgovor | doprinosi)

Skoči na: orijentacija, traži

Ova skripta se više ne održava na ovoj lokaciji, već se aktualna verzija može naći na ovoj adresi.

#!/usr/bin/python
# -*- coding: utf-8 -*-

# author: Andrej Dundovic
# date: 4. 2011.
# contact: andrej@dundovic.com.hr

import urllib
import urllib2
from BeautifulSoup import BeautifulSoup
import re, htmlentitydefs
import sys

class Word:
    """Fetch one dictionary entry from hjp.srce.hr and format it.

    Constructing an instance performs the lookup immediately (network
    access).  Afterwards ``data`` holds the extracted text fields of the
    entry and ``output`` holds an HTML-formatted summary (or a "not found"
    message).

    Option letters (any combination, stored in ``options``):
        n  word            u  URL            g  grammar
        d  definition      s  syntagm        p  phraseology
        o  onomastics      e  etymology
    The letter ``f`` is still accepted as an alias of ``p`` because older
    versions of this class tested for ``f``.
    """

    options = 'nugdspoe'  # default options (all)
    useful_data = ''
    output = ''
    # Class-level fallback only; __init__ gives every instance its own dict so
    # that results of one lookup never leak into another instance.
    data = dict()
    service_url = 'http://hjp.srce.hr/index.php?show=search'
    target_word = ''

    # data key -> id attribute of the HTML element holding that piece of text
    _FRAME_IDS = (
        ('name', 'natuknica_raster_frame'),
        ('grammar', 'osnovni_podaci_frame'),
        ('definition', 'definicija_frame'),
        ('syntagm', 'sintagma_frame'),
        ('phraseology', 'frazeologija_frame'),
        ('onomastics', 'onomastika_frame'),
        ('etymology', 'etimologija_frame'),
    )

    # (accepted option letters, data key, HTML heading) in output order
    _HTML_SECTIONS = (
        ('d', 'definition', 'Definicija'),
        ('s', 'syntagm', 'Sintagma'),
        ('pf', 'phraseology', 'Frazeologija'),
        ('o', 'onomastics', 'Onomastika'),
        ('e', 'etymology', 'Etimologija'),
    )

    # (accepted option letters, data key, console label) in output order
    _PRINT_FIELDS = (
        ('n', 'name', "> Rijec: "),
        ('u', 'url', "> URL: "),
        ('g', 'grammar', "> Gramatika: "),
        ('d', 'definition', "> Definicije: "),
        ('s', 'syntagm', "> Sintagma: "),
        ('pf', 'phraseology', "> Frazeologija: "),
        ('o', 'onomastics', "> Onomastika: "),
        ('e', 'etymology', "> Etimologija: "),
    )

    def get_content(self):
        """POST ``target_word`` to ``service_url`` and return the entry markup.

        Returns the string form of the first element with class
        ``natuknica``; when the page has no such element the result is the
        literal string ``'None'`` (``str(None)``), which callers test for.
        """
        post_data = urllib.urlencode({'word': self.target_word})
        req = urllib2.Request(self.service_url, post_data)
        response = urllib2.urlopen(req)
        try:
            soup = BeautifulSoup(response)
        finally:
            # Release the connection even if parsing fails.
            response.close()

        return str(soup.find(attrs={"class": "natuknica"}))

    def unescape(self, text):
        """Replace HTML character references and named entities in *text*.

        References that cannot be resolved (bad number, unknown entity name)
        are left in the text unchanged.
        """
        def fixup(match):
            entity = match.group(0)
            if entity[:2] == "&#":
                # numeric character reference, hexadecimal or decimal
                try:
                    if entity[:3] == "&#x":
                        return unichr(int(entity[3:-1], 16))
                    return unichr(int(entity[2:-1]))
                except ValueError:
                    pass
            else:
                # named entity
                try:
                    entity = unichr(htmlentitydefs.name2codepoint[entity[1:-1]])
                except KeyError:
                    pass
            return entity  # leave as is

        return re.sub(r"&#?\w+;", fixup, text)

    def remove_html_tags(self, string):
        """Return *string* as text with entities resolved and tags removed.

        *string* is converted with ``str()`` first, so passing ``None``
        yields the text ``'None'``; ``__init__`` relies on that to detect a
        missing entry name.
        """
        unescaped = self.unescape(str(string).strip())
        return re.sub(r'<.*?>', '', unescaped).strip()

    def extract_data(self):
        """Fill ``data`` with the text fields found in ``useful_data``."""
        for key, frame_id in self._FRAME_IDS:
            element = self.useful_data.find(attrs={"id": frame_id})
            self.data[key] = self.remove_html_tags(element)

        # The entry URL is the target of the first link; tolerate markup
        # without any link instead of failing on ``None['href']``.
        anchor = self.useful_data.find('a')
        self.data['url'] = anchor['href'] if anchor is not None else ''

    def __init__(self, word):
        """Look up *word* and prepare ``data`` and ``output``.

        When the search yields a list of candidate entries instead of a
        single entry, every candidate is fetched in turn and its formatted
        text is appended to ``output``; ``data`` then holds the last one.
        """
        self.data = {}
        self.target_word = word

        subpage = self.get_content()
        if subpage == 'None':
            self.output = "Rijec nije nadena"
            return

        self.useful_data = BeautifulSoup(subpage)
        self.extract_data()
        if self.data['name'] != 'None':
            # single choice
            self.results_formatting(0)
            return

        # multiple choice: follow the link in each table cell.
        # NOTE(review): the first two cells are skipped, presumably because
        # they are not candidate links - confirm against the live page.
        for choice in self.useful_data.findAll('td')[2:]:
            anchor = BeautifulSoup(str(choice)).find('a')
            if anchor is None:
                continue  # cell without a link, nothing to follow
            self.service_url = "http://hjp.srce.hr/" + anchor['href']
            self.target_word = ''
            subpage = self.get_content()
            if subpage == 'None':
                continue  # linked page carries no entry
            self.useful_data = BeautifulSoup(subpage)
            self.extract_data()
            self.results_formatting(1)

    def _wants(self, letters):
        """Return True when any of the option *letters* is enabled."""
        return any(letter in self.options for letter in letters)

    def results_formatting(self, append):
        """Render ``data`` as HTML into ``output``.

        With ``append == 1`` the rendering is added to the existing
        ``output``; any other value replaces it.  Optional sections are
        emitted only when enabled in ``options`` and non-empty.
        """
        parts = [self.output] if append == 1 else []

        if 'n' in self.options:
            parts.append("<h2>" + self.data['name'] + "</h2>")
        if 'g' in self.options:
            parts.append(self.data['grammar'])
        for letters, key, heading in self._HTML_SECTIONS:
            if self._wants(letters) and self.data[key] != '':
                parts.append("<br /><br /><b>" + heading + "</b><hr />"
                             + self.data[key])
        if 'u' in self.options:
            parts.append("<br /><br /><b>URL</b><hr />" + self.data['url'])

        self.output = ''.join(parts)

    def results(self):
        """Return the formatted ``output`` string."""
        return self.output

    def print_all(self):
        """Print the fields enabled in ``options`` to standard output.

        When no entry data is available (the word was not found) the
        message stored in ``output`` is printed instead.
        """
        if not self.data:
            print(self.output)
            return

        for letters, key, label in self._PRINT_FIELDS:
            if self._wants(letters):
                # Single formatted argument keeps the same text under both
                # the print statement and the print function.
                print("%s %s" % (label, self.data[key]))
	

# main
# Command-line entry point: look up the word given as the first argument and
# print the fields selected by the remaining arguments.
if __name__ == "__main__":
    # The first argument must be the word itself.  Only a *leading* dash means
    # an option was passed in its place; words that merely contain a hyphen
    # are valid queries.
    if len(sys.argv) < 2 or sys.argv[1].startswith('-'):
        sys.exit('Usage: %s word [options]\n'
                 'Options return only specific information about word:\n'
                 '\t-n\tword (with accents)\n\t-u\tURL on HJP\n'
                 '\t-g\tgrammar\n\t-d\tdefinition\n'
                 '\t-s\tsyntagm\n\t-p\tphraseology\n'
                 '\t-o\tonomastics\n\t-e\tetymology\n'
                 '\tMultiple combinations are allowed too;\n'
                 '\twithout any, program returns all information' % sys.argv[0])

    # query, options and word stay module-level names, as in the original
    # script layout.
    query = sys.argv[1]

    # All option arguments are concatenated into one string of letters;
    # without any, every field is selected.
    if len(sys.argv) > 2:
        options = ''.join(sys.argv[2:])
    else:
        options = 'nugdspoe'

    word = Word(query)
    word.options = options
    word.print_all()