Suradnik:4ndY/hjp.py

Izvor: HrOpenWiki
Inačica od 22:09, 3. travnja 2011. koju je unio/unijela 4ndY (razgovor | doprinosi)

(razl) ←Starija inačica | vidi trenutačnu inačicu (razl) | Novija inačica→ (razl)
Skoči na: orijentacija, traži
#!/usr/bin/python

import htmlentitydefs
import re
import sys
import urllib
import urllib2

from BeautifulSoup import BeautifulSoup

# The word to look up is the only required command-line argument.
if len(sys.argv) < 2:
    sys.exit('Usage: %s word' % sys.argv[0])

query = sys.argv[1]

# Search address and the url-encoded request body; the query travels in the
# 'word' form field.
service_url = 'http://hjp.srce.hr/index.php?show=search'
values = {'word': query}
data = urllib.urlencode(values)

def get_page(service_url, data):
    """Send ``data`` to ``service_url`` and return the raw response body.

    Args:
        service_url: address the request is sent to.
        data: url-encoded request body; urllib2 issues a POST when a body
            is supplied.

    Returns:
        The complete response body as returned by ``read()``.

    Raises:
        urllib2.URLError: propagated unchanged when the request fails.
    """
    req = urllib2.Request(service_url, data)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()

def unescape(text):
    """Replace HTML character references and named entities in ``text``.

    Decimal (``&#65;``) and hexadecimal (``&#x41;`` / ``&#X41;``) character
    references and known named entities (``&amp;``) are replaced by the
    character they denote.  Anything that cannot be resolved -- an unknown
    entity name, a malformed number, or a code point the interpreter
    rejects -- is left in the text exactly as it was.

    Args:
        text: string that may contain HTML entities.

    Returns:
        ``text`` with every resolvable entity replaced.
    """
    # Resolve the entity table and the code-point-to-character function
    # locally so the function runs on both Python 2 and Python 3.
    try:
        from htmlentitydefs import name2codepoint  # Python 2
        to_char = unichr
    except ImportError:
        from html.entities import name2codepoint  # Python 3
        to_char = chr

    def fixup(match):
        token = match.group(0)
        if token[:2] == "&#":
            # Numeric character reference, decimal unless prefixed by x/X.
            digits, base = token[2:-1], 10
            if digits[:1] in ("x", "X"):
                digits, base = digits[1:], 16
            try:
                return to_char(int(digits, base))
            except (ValueError, OverflowError):
                # Not a number, or a number that is no valid code point.
                return token
        # Named entity.
        try:
            return to_char(name2codepoint[token[1:-1]])
        except KeyError:
            return token  # unknown name: leave as is

    return re.sub(r"&#?\w+;", fixup, text)

def remove_html_tags(data):
    """Return the text of ``data`` with entities decoded and tags removed.

    ``data`` is converted with ``str()`` first, so any object is accepted;
    ``None`` therefore produces the literal text ``'None'``.  Entities are
    decoded before the tags are stripped.  The tag pattern does not match
    across line breaks, so a tag split over several lines is left in place.

    Args:
        data: markup, or an object whose ``str()`` form is markup.

    Returns:
        The text with every ``<...>`` run replaced by the empty string.
    """
    p = re.compile(r'<.*?>')
    return p.sub('', unescape(str(data)))

class Word:
    """Fields of one dictionary entry, extracted from a fragment of HTML.

    Attributes:
        useful_data: ``BeautifulSoup`` tree built from the markup given to
            the constructor.
        data: dict with the keys ``name``, ``word_url``, ``grammar``,
            ``definition``, ``syntagm``, ``phraseology``, ``onomastics``
            and ``etymology``.  Text fields hold the tag-stripped content
            of the matching element, or ``'None'`` when the element is
            absent; ``word_url`` holds the ``href`` of the first link, or
            ``None`` when the fragment contains no link.
    """

    # Class-level default; every instance replaces it in __init__.
    useful_data = ''

    # Key in ``data`` -> ``id`` attribute of the element that holds it.
    _FRAME_IDS = (
        ('name', 'natuknica_raster_frame'),
        ('grammar', 'osnovni_podaci_frame'),
        ('definition', 'definicija_frame'),
        ('syntagm', 'sintagma_frame'),
        ('phraseology', 'frazeologija_frame'),
        ('onomastics', 'onomastika_frame'),
        ('etymology', 'etimologija_frame'),
    )

    def extract_data(self):
        """Fill ``self.data`` from the parsed markup in ``self.useful_data``."""
        for key, frame_id in self._FRAME_IDS:
            frame = self.useful_data.find(attrs={"id": frame_id})
            self.data[key] = remove_html_tags(frame)

        # Compare against None explicitly instead of relying on truthiness
        # of the returned element.
        anchor = self.useful_data.find('a')
        self.data['word_url'] = anchor['href'] if anchor is not None else None

    def __init__(self, raw_data):
        """Parse ``raw_data`` (HTML markup) and extract the entry fields."""
        self.useful_data = BeautifulSoup(raw_data)
        # A fresh dict per instance: a dict defined on the class would be
        # shared, and written to, by every Word object.
        self.data = {}
        self.extract_data()

soup = BeautifulSoup( get_page( service_url, data ) )


word = Word( str( soup.find( attrs={ "class" : "natuknica" } ) ) )

if ( word.data['name'] == 'None' ):

   """ multiple choice """
   word_url = word.useful_data.findAll('td')
   for choice in word_url[2:]:

link = BeautifulSoup( str( choice ) ) print remove_html_tags( choice ) print "URL: ", "http://hjp.srce.hr/"+link.find('a')['href'] else:

   print u"Rijec: ", word.data['name']
   print u"URL: ", word.data['word_url']
   print u"Gramatika: ", word.data['grammar']
   print u"Definicije: ", word.data['definition']
   print u"Sintagma: ", word.data['syntagm']
   print u"Frazeologija: ", word.data['phraseology']
   print u"Onomastika: ", word.data['onomastics']
   print u"Etimologija: ", word.data['etymology']