Suradnik:4ndY/hjp.py
Izvor: HrOpenWiki
Ova skripta se više ne održava na ovoj lokaciji, već se aktualna verzija može naći na ovoj adresi.
#!/usr/bin/python # -*- coding: utf-8 -*- ################################### # author: Andrej Dundovic # date: 5. 2012. # contact: andrej@dundovic.com.hr # licence: GPLv3 # description: script for croatian # dictionary (HJP) look-up ################################### import requests from BeautifulSoup import BeautifulSoup import re, htmlentitydefs import sys class Word: """ class for fetching words from hjp.novi-liber.hr """ def __init__( self ): """ init method, define some variables """ self.options = 'nugdspoe' # default options (all) self.useful_data = '' self.output = '' self.data = [] self.target_url = '' self.service_url = 'http://hjp.novi-liber.hr/' self.autocompl_url = 'http://hjp.novi-liber.hr/hjp_ajax.php' self.target_word = ''; self.words = [] def set_options( self, options ): """ set which parts of definition will be displayed """ self.options = options def get_options( self ): """ get which parts of definition will be displayed """ return self.options def search( self, word ): """ search for word - THE method """ # look for a word self.target_word = word # get list of possible word autocompletion self.get_autocomplete() # print autocompletion list if len( self.words ) > 1: for word in self.words: print word print # take the first one from the list self.target_word = self.words[0] self.target_url = self.service_url + 'index.php?show=search' self.useful_data = BeautifulSoup( self.get_content( { 'word': self.target_word } ) ) # test is the word homonym if self.useful_data.find( attrs={ "id" : "natuknica_raster_frame" } ) is None: self.multiple_definition() else: self.extract_data() def get_content( self, post_data ): """ get page and parse useful content """ req = requests.post( self.target_url, data = post_data ) soup = BeautifulSoup( req.text ) return str( soup.find( attrs={ "class" : "natuknica" } ) ) def get_autocomplete( self ): """ get list of possible word autocompletion """ get_data = { 'q': self.target_word, 'limit': 10, 's': 's' } req = requests.get( self.autocompl_url, params = get_data, timeout = 1 ) self.words = req.text.split('\n')[0:-1] def multiple_definition( self ): """ word is homonym so get all homonyms """ for choice in self.useful_data.findAll('a'): self.target_url = self.service_url + choice.get('href') self.useful_data = BeautifulSoup( self.get_content( '' ) ) self.extract_data() def unescape( self, text ): """ fix for HTML characters """ def fixup(m): text = m.group(0) if text[:2] == "&#": # character reference try: if text[:3] == "&#x": return unichr(int(text[3:-1], 16)) else: return unichr(int(text[2:-1])) except ValueError: pass else: # named entity try: text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) except KeyError: pass return text # leave as is return re.sub("&#?\w+;", fixup, text) def remove_html_tags( self, string ): """ removes HTML tags and convert HTML escaped chars in single character equivalent """ p = re.compile(r'<.*?>') return p.sub('', self.unescape( str( string ).strip() ) ).strip() def extract_data( self ): """ extract clean data from HTML """ self.data.append( dict() ) self.data[-1]['name'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "natuknica_raster_frame" } ) ) # word self.data[-1]['url'] = self.useful_data.find('a')['href'] # word URL self.data[-1]['grammar'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "osnovni_podaci_frame" } ) ) # grammar info self.data[-1]['definition'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "definicija_frame" } ) ) # definition self.data[-1]['syntagm'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "sintagma_frame" } ) ) # sintagm self.data[-1]['phraseology'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "frazeologija_frame" } ) ) # phraseology self.data[-1]['onomastics'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "onomastika_frame" } ) ) # onomastics self.data[-1]['etymology'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "etimologija_frame" } ) ) # etymology def print_html( self ): """ print HTML output """ self.output = '' for definition in self.data: if 'n' in self.options: self.output += "<h2>" + definition['name'] + "</h2>" if 'g' in self.options: self.output += definition['grammar'] if 'd' in self.options and definition['definition'] <> '': self.output += "<br /><br /><b>Definicija</b><hr />" + definition['definition'] if 's' in self.options and definition['syntagm'] <> '': self.output += "<br /><br /><b>Sintagma</b><hr />" + definition['syntagm'] if 'f' in self.options and definition['phraseology'] <> '': self.output += "<br /><br /><b>Frazeologija</b><hr />" + definition['phraseology'] if 'o' in self.options and definition['onomastics'] <> '': self.output += "<br /><br /><b>Onomastika</b><hr />" + definition['onomastics'] if 'e' in self.options and definition['etymology'] <> '': self.output += "<br /><br /><b>Etimologija</b><hr />" + definition['etymology'] if 'u' in self.options: self.output += "<br /><br /><b>URL</b><hr />" + definition['url'] print self.output def print_clean( self ): """ print clean outpu (CLI) """ self.output = '' for definition in self.data: if 'n' in options: self.output += u"> Rijec: " + definition['name'] + "\n" if 'u' in options: self.output += u"> URL: " + definition['url'] + "\n" if 'g' in options: self.output += u"> Gramatika: " + definition['grammar'] + "\n" if 'd' in options: self.output += u"> Definicije: " + definition['definition'] + "\n" if 's' in options: self.output += u"> Sintagma: " + definition['syntagm'] + "\n" if 'f' in options: self.output += u"> Frazeologija: " + definition['phraseology'] + "\n" if 'o' in options: self.output += u"> Onomastika: " + definition['onomastics'] + "\n" if 'e' in options: self.output += u"> Etimologija: " + definition['etymology'] + "\n" self.output += "\n" print self.output if __name__ == "__main__": if len(sys.argv) < 2 or '-' in sys.argv[1]: """ help """ sys.exit('Usage: %s word [options]\n' 'Options return only specific information about word:\n' '\t-n\tword (with accents)\n\t-u\tURL on HJP\n' '\t-g\tgrammar\n\t-d\tdefinition\n' '\t-s\tsyntagm\n\t-p\tphraseology\n' '\t-o\tonomastics\n\t-e\tetymology\n' '\tMultiple combinations are allowed too;\n' '\twithout any, program returns all information' % sys.argv[0]) else: query = sys.argv[1] options = 'nugdspoe' if len(sys.argv) > 2: options = ''.join( sys.argv[2:] ) word = Word() word.set_options( options ) word.search( query ) word.print_clean()