Razlika između inačica stranice Suradnik:4ndY/hjp.py
Izvor: HrOpenWiki
m |
m |
||
Nije prikazano 11 međuinačica istog suradnika | |||
Redak 1: | Redak 1: | ||
+ | '''Ova skripta se više ne održava na ovoj lokaciji, već se aktualna verzija može naći na [https://gitorious.org/hjp-plasmoid/hjp-plasmoid/trees/master ovoj] adresi.''' | ||
+ | |||
<pre> | <pre> | ||
#!/usr/bin/python | #!/usr/bin/python | ||
+ | # -*- coding: utf-8 -*- | ||
− | + | ################################### | |
− | + | # author: Andrej Dundovic | |
− | import | + | # date: 5. 2012. |
+ | # contact: andrej AT dundovic DOT com.hr | ||
+ | # licence: GPLv3 | ||
+ | # description: script for croatian | ||
+ | # dictionary (HJP) look-up | ||
+ | ################################### | ||
+ | |||
+ | import requests | ||
from BeautifulSoup import BeautifulSoup | from BeautifulSoup import BeautifulSoup | ||
import re, htmlentitydefs | import re, htmlentitydefs | ||
+ | import sys | ||
− | + | class Word: | |
− | + | """ class for fetching words from hjp.novi-liber.hr """ | |
− | + | ||
− | + | ||
− | + | ||
− | + | def __init__( self ): | |
− | + | """ init method, define some variables """ | |
+ | |||
+ | self.options = 'nugdspoe' # default options (all) | ||
+ | self.useful_data = '' | ||
+ | self.output = '' | ||
+ | self.data = [] | ||
+ | self.target_url = '' | ||
+ | self.service_url = 'http://hjp.novi-liber.hr/' | ||
+ | self.autocompl_url = 'http://hjp.novi-liber.hr/hjp_ajax.php' | ||
+ | self.target_word = ''; | ||
+ | self.words = [] | ||
+ | |||
− | + | def set_options( self, options ): | |
− | + | """ set which parts of definition will be displayed """ | |
− | + | ||
+ | self.options = options | ||
− | def | + | |
− | req = | + | def get_options( self ): |
− | + | """ get which parts of definition will be displayed """ | |
− | + | ||
+ | return self.options | ||
+ | |||
+ | |||
+ | def search( self, word ): | ||
+ | """ search for word - THE method """ | ||
+ | |||
+ | # look for a word | ||
+ | self.target_word = word | ||
+ | |||
+ | # get list of possible word autocompletion | ||
+ | self.get_autocomplete() | ||
+ | |||
+ | # print autocompletion list | ||
+ | if len( self.words ) > 1: | ||
+ | for word in self.words: | ||
+ | print word | ||
+ | print | ||
+ | |||
+ | # take the first one from the list | ||
+ | self.target_word = self.words[0] | ||
+ | self.target_url = self.service_url + 'index.php?show=search' | ||
+ | self.useful_data = BeautifulSoup( self.get_content( { 'word': self.target_word } ) ) | ||
+ | |||
+ | # test is the word homonym | ||
+ | if self.useful_data.find( attrs={ "id" : "natuknica_raster_frame" } ) is None: | ||
+ | self.multiple_definition() | ||
+ | |||
+ | else: | ||
+ | self.extract_data() | ||
+ | |||
+ | def get_content( self, post_data ): | ||
+ | """ get page and parse useful content """ | ||
+ | |||
+ | req = requests.post( self.target_url, data = post_data ) | ||
+ | |||
+ | soup = BeautifulSoup( req.text ) | ||
+ | |||
+ | return str( soup.find( attrs={ "class" : "natuknica" } ) ) | ||
− | + | ||
− | def | + | def get_autocomplete( self ): |
− | + | """ get list of possible word autocompletion """ | |
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | get_data = { 'q': self.target_word, 'limit': 10, 's': 's' } | |
− | + | req = requests.get( self.autocompl_url, params = get_data, timeout = 1 ) | |
− | + | ||
− | + | self.words = req.text.split('\n')[0:-1] | |
− | useful_data = '' | + | |
− | + | ||
+ | def multiple_definition( self ): | ||
+ | """ word is homonym so get all homonyms """ | ||
+ | |||
+ | for choice in self.useful_data.findAll('a'): | ||
+ | self.target_url = self.service_url + choice.get('href') | ||
+ | self.useful_data = BeautifulSoup( self.get_content( '' ) ) | ||
+ | self.extract_data() | ||
+ | |||
+ | |||
+ | def unescape( self, text ): | ||
+ | """ fix for HTML characters """ | ||
+ | |||
+ | def fixup(m): | ||
+ | text = m.group(0) | ||
+ | if text[:2] == "&#": | ||
+ | # character reference | ||
+ | try: | ||
+ | if text[:3] == "&#x": | ||
+ | return unichr(int(text[3:-1], 16)) | ||
+ | else: | ||
+ | return unichr(int(text[2:-1])) | ||
+ | except ValueError: | ||
+ | pass | ||
+ | else: | ||
+ | # named entity | ||
+ | try: | ||
+ | text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) | ||
+ | except KeyError: | ||
+ | pass | ||
+ | return text # leave as is | ||
+ | return re.sub("&#?\w+;", fixup, text) | ||
+ | |||
+ | |||
+ | def remove_html_tags( self, string ): | ||
+ | """ removes HTML tags and convert HTML escaped chars in single character equivalent """ | ||
+ | |||
+ | p = re.compile(r'<.*?>') | ||
+ | return p.sub('', self.unescape( str( string ).strip() ) ).strip() | ||
+ | |||
def extract_data( self ): | def extract_data( self ): | ||
− | + | """ extract clean data from HTML """ | |
− | self.data['name'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "natuknica_raster_frame" } ) ) | + | |
− | + | self.data.append( dict() ) | |
− | self.data[' | + | |
− | + | self.data[-1]['name'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "natuknica_raster_frame" } ) ) # word | |
− | self.data['grammar'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "osnovni_podaci_frame" } ) ) | + | self.data[-1]['url'] = self.useful_data.find('a')['href'] # word URL |
− | + | self.data[-1]['grammar'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "osnovni_podaci_frame" } ) ) # grammar info | |
− | self.data['definition'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "definicija_frame" } ) ) | + | self.data[-1]['definition'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "definicija_frame" } ) ) # definition |
− | + | self.data[-1]['syntagm'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "sintagma_frame" } ) ) # sintagm | |
− | self.data['syntagm'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "sintagma_frame" } ) ) | + | self.data[-1]['phraseology'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "frazeologija_frame" } ) ) # phraseology |
− | + | self.data[-1]['onomastics'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "onomastika_frame" } ) ) # onomastics | |
− | self.data['phraseology'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "frazeologija_frame" } ) ) | + | self.data[-1]['etymology'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "etimologija_frame" } ) ) # etymology |
− | + | ||
− | self.data['onomastics'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "onomastika_frame" } ) ) | + | |
− | + | def print_html( self ): | |
− | self.data['etymology'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "etimologija_frame" } ) ) | + | """ print HTML output """ |
+ | |||
+ | self.output = '' | ||
+ | |||
+ | for definition in self.data: | ||
+ | |||
+ | if 'n' in self.options: | ||
+ | self.output += "<h2>" + definition['name'] + "</h2>" | ||
+ | if 'g' in self.options: | ||
+ | self.output += definition['grammar'] | ||
+ | if 'd' in self.options and definition['definition'] <> '': | ||
+ | self.output += "<br /><br /><b>Definicija</b><hr />" + definition['definition'] | ||
+ | if 's' in self.options and definition['syntagm'] <> '': | ||
+ | self.output += "<br /><br /><b>Sintagma</b><hr />" + definition['syntagm'] | ||
+ | if 'f' in self.options and definition['phraseology'] <> '': | ||
+ | self.output += "<br /><br /><b>Frazeologija</b><hr />" + definition['phraseology'] | ||
+ | if 'o' in self.options and definition['onomastics'] <> '': | ||
+ | self.output += "<br /><br /><b>Onomastika</b><hr />" + definition['onomastics'] | ||
+ | if 'e' in self.options and definition['etymology'] <> '': | ||
+ | self.output += "<br /><br /><b>Etimologija</b><hr />" + definition['etymology'] | ||
+ | if 'u' in self.options: | ||
+ | self.output += "<br /><br /><b>URL</b><hr />" + definition['url'] | ||
− | def | + | print self.output |
− | + | ||
− | self. | + | def print_clean( self ): |
− | + | """ print clean outpu (CLI) """ | |
− | + | ||
− | + | self.output = '' | |
− | + | ||
− | + | for definition in self.data: | |
− | + | ||
− | + | if 'n' in options: | |
− | + | self.output += u"> Rijec: " + definition['name'] + "\n" | |
− | + | if 'u' in options: | |
− | + | self.output += u"> URL: " + definition['url'] + "\n" | |
− | + | if 'g' in options: | |
− | + | self.output += u"> Gramatika: " + definition['grammar'] + "\n" | |
− | + | if 'd' in options: | |
− | + | self.output += u"> Definicije: " + definition['definition'] + "\n" | |
− | + | if 's' in options: | |
− | + | self.output += u"> Sintagma: " + definition['syntagm'] + "\n" | |
− | + | if 'f' in options: | |
− | + | self.output += u"> Frazeologija: " + definition['phraseology'] + "\n" | |
+ | if 'o' in options: | ||
+ | self.output += u"> Onomastika: " + definition['onomastics'] + "\n" | ||
+ | if 'e' in options: | ||
+ | self.output += u"> Etimologija: " + definition['etymology'] + "\n" | ||
+ | self.output += "\n" | ||
+ | print self.output | ||
− | |||
− | + | if __name__ == "__main__": | |
− | if ( | + | if len(sys.argv) < 2 or '-' in sys.argv[1]: |
− | + | """ help """ | |
− | + | sys.exit('Usage: %s word [options]\n' | |
− | + | 'Options return only specific information about word:\n' | |
− | + | '\t-n\tword (with accents)\n\t-u\tURL on HJP\n' | |
− | + | '\t-g\tgrammar\n\t-d\tdefinition\n' | |
− | + | '\t-s\tsyntagm\n\t-p\tphraseology\n' | |
− | + | '\t-o\tonomastics\n\t-e\tetymology\n' | |
− | + | '\tMultiple combinations are allowed too;\n' | |
+ | '\twithout any, program returns all information' % sys.argv[0]) | ||
+ | else: | ||
+ | query = sys.argv[1] | ||
+ | options = 'nugdspoe' | ||
+ | |||
+ | if len(sys.argv) > 2: | ||
+ | options = ''.join( sys.argv[2:] ) | ||
+ | |||
+ | word = Word() | ||
+ | word.set_options( options ) | ||
+ | word.search( query ) | ||
+ | word.print_clean() | ||
− | |||
− | |||
− | |||
</pre> | </pre> |
Trenutačna izmjena od 00:06, 12. studenoga 2012.
Ova skripta se više ne održava na ovoj lokaciji, već se aktualna verzija može naći na ovoj adresi: https://gitorious.org/hjp-plasmoid/hjp-plasmoid/trees/master
#!/usr/bin/python
# -*- coding: utf-8 -*-
###################################
# author: Andrej Dundovic
# date: 5. 2012.
# contact: andrej AT dundovic DOT com.hr
# licence: GPLv3
# description: script for croatian
# dictionary (HJP) look-up
###################################
#
# NOTE: this is a Python 2 script (it relies on ``unichr``,
# ``htmlentitydefs`` and the BeautifulSoup 3 package).

import htmlentitydefs
import re
import sys

import requests
from BeautifulSoup import BeautifulSoup

# Matches a numeric (&#123; / &#x7b;) or named (&amp;) HTML entity.
_ENTITY_RE = re.compile(r"&#?\w+;")
# Matches a single HTML tag (non-greedy, does not span line breaks).
_TAG_RE = re.compile(r'<.*?>')


class Word(object):
    """Fetch dictionary entries from hjp.novi-liber.hr and print them.

    Typical use: ``set_options()`` (optional), ``search(word)``, then
    ``print_clean()`` or ``print_html()``.

    The option string is a set of one-letter flags selecting which parts of
    an entry are printed: ``n`` name, ``u`` URL, ``g`` grammar,
    ``d`` definition, ``s`` syntagm, ``p`` phraseology, ``o`` onomastics,
    ``e`` etymology.  ``f`` is still accepted as an alias for ``p`` because
    earlier revisions of this script tested for ``f``.
    """

    # (data key, id attribute of the HTML element holding that section)
    _FRAMES = (
        ('name', 'natuknica_raster_frame'),
        ('grammar', 'osnovni_podaci_frame'),
        ('definition', 'definicija_frame'),
        ('syntagm', 'sintagma_frame'),
        ('phraseology', 'frazeologija_frame'),
        ('onomastics', 'onomastika_frame'),
        ('etymology', 'etimologija_frame'),
    )

    # (option letters, data key, heading) -- sections that print_html()
    # emits only when they are non-empty.
    _HTML_SECTIONS = (
        ('d', 'definition', "Definicija"),
        ('s', 'syntagm', "Sintagma"),
        ('pf', 'phraseology', "Frazeologija"),
        ('o', 'onomastics', "Onomastika"),
        ('e', 'etymology', "Etimologija"),
    )

    # (option letters, data key, label) -- lines emitted by print_clean(),
    # in output order.
    _CLEAN_FIELDS = (
        ('n', 'name', u"> Rijec: "),
        ('u', 'url', u"> URL: "),
        ('g', 'grammar', u"> Gramatika: "),
        ('d', 'definition', u"> Definicije: "),
        ('s', 'syntagm', u"> Sintagma: "),
        ('pf', 'phraseology', u"> Frazeologija: "),
        ('o', 'onomastics', u"> Onomastika: "),
        ('e', 'etymology', u"> Etimologija: "),
    )

    def __init__(self):
        """Initialise an empty look-up with all output parts enabled."""
        self.options = 'nugdspoe'  # default options (all)
        self.useful_data = ''      # parsed HTML of the entry being processed
        self.output = ''           # text produced by the last print_* call
        self.data = []             # one dict per extracted entry
        self.target_url = ''
        self.service_url = 'http://hjp.novi-liber.hr/'
        self.autocompl_url = 'http://hjp.novi-liber.hr/hjp_ajax.php'
        self.target_word = ''
        self.words = []            # autocompletion suggestions

    def set_options(self, options):
        """Set which parts of a definition will be displayed."""
        self.options = options

    def get_options(self):
        """Return the option string selecting the displayed parts."""
        return self.options

    def _enabled(self, letters):
        """Return True if any of the option *letters* is switched on."""
        return any(letter in self.options for letter in letters)

    def search(self, word):
        """Look up *word* and store the extracted entries in ``self.data``.

        The word is first sent to the autocompletion service; when more than
        one suggestion comes back the list is printed, and the first
        suggestion replaces the typed word.  If there is no suggestion the
        word is searched exactly as given.

        NOTE: ``self.data`` is only ever appended to, so calling search()
        repeatedly on one instance accumulates entries.
        """
        self.target_word = word

        # get list of possible word autocompletion
        self.get_autocomplete()

        # print autocompletion list
        if len(self.words) > 1:
            for suggestion in self.words:
                print(suggestion)
            print('')

        # take the first suggestion; keep the typed word when there is none
        # (indexing an empty list here used to raise IndexError)
        if self.words:
            self.target_word = self.words[0]
        self.target_url = self.service_url + 'index.php?show=search'
        self.useful_data = BeautifulSoup(
            self.get_content({'word': self.target_word}))

        # A page without the headword frame is treated as a list of
        # homonyms whose links are followed one by one.
        headword = self.useful_data.find(
            attrs={"id": "natuknica_raster_frame"})
        if headword is None:
            self.multiple_definition()
        else:
            self.extract_data()

    def get_content(self, post_data):
        """POST *post_data* to ``self.target_url`` and return the entry HTML.

        Returns the string form of the first element with class
        ``natuknica``; that is the text ``'None'`` when no such element
        exists.
        """
        req = requests.post(self.target_url, data=post_data)
        soup = BeautifulSoup(req.text)
        return str(soup.find(attrs={"class": "natuknica"}))

    def get_autocomplete(self):
        """Fill ``self.words`` with autocompletion suggestions.

        The response body is read as one suggestion per line.  Blank lines
        are ignored, so a response without a trailing newline no longer
        loses its last suggestion.  A failed or timed-out request leaves
        ``self.words`` empty instead of aborting the whole look-up.
        """
        get_data = {'q': self.target_word, 'limit': 10, 's': 's'}
        try:
            req = requests.get(self.autocompl_url, params=get_data, timeout=1)
        except requests.exceptions.RequestException:
            # Autocompletion is only a convenience; search() falls back to
            # the word as typed.
            self.words = []
            return

        self.words = [line.strip()
                      for line in req.text.splitlines()
                      if line.strip()]

    def multiple_definition(self):
        """Follow every link of a homonym list and extract each entry."""
        for choice in self.useful_data.findAll('a'):
            href = choice.get('href')
            if not href:
                # anchor without a target: nothing to fetch
                continue
            self.target_url = self.service_url + href
            self.useful_data = BeautifulSoup(self.get_content(''))
            self.extract_data()

    def unescape(self, text):
        """Replace HTML character references and named entities in *text*.

        Unknown or malformed entities are left unchanged.
        """

        def fixup(match):
            entity = match.group(0)
            if entity[:2] == "&#":
                # character reference
                try:
                    if entity[:3] == "&#x":
                        return unichr(int(entity[3:-1], 16))
                    return unichr(int(entity[2:-1]))
                except ValueError:
                    pass
            else:
                # named entity
                try:
                    return unichr(htmlentitydefs.name2codepoint[entity[1:-1]])
                except KeyError:
                    pass
            return entity  # leave as is

        return _ENTITY_RE.sub(fixup, text)

    def remove_html_tags(self, string):
        """Return *string* with HTML tags removed and entities unescaped.

        *string* may be any object with a string form (e.g. a parsed tag).
        ``None`` -- the result of a failed ``find()`` -- yields ``''`` rather
        than the literal text ``'None'``.
        """
        if string is None:
            return ''
        return _TAG_RE.sub('', self.unescape(str(string).strip())).strip()

    def extract_data(self):
        """Append the entry held in ``self.useful_data`` to ``self.data``.

        Every entry is a dict with the keys ``name``, ``url``, ``grammar``,
        ``definition``, ``syntagm``, ``phraseology``, ``onomastics`` and
        ``etymology``; sections missing from the page become ``''``.
        """
        soup = self.useful_data
        entry = {}
        for key, frame_id in self._FRAMES:
            entry[key] = self.remove_html_tags(
                soup.find(attrs={"id": frame_id}))

        # word URL: href of the first link, if the entry has one
        link = soup.find('a')
        entry['url'] = link.get('href', '') if link is not None else ''

        self.data.append(entry)

    def print_html(self):
        """Print the collected entries as an HTML fragment.

        The fragment is also stored in ``self.output``.  Optional sections
        are skipped when they are empty.
        """
        parts = []
        for definition in self.data:
            if self._enabled('n'):
                parts.append("<h2>" + definition['name'] + "</h2>")
            if self._enabled('g'):
                parts.append(definition['grammar'])
            for letters, key, title in self._HTML_SECTIONS:
                if self._enabled(letters) and definition[key] != '':
                    parts.append("<br /><br /><b>" + title + "</b><hr />"
                                 + definition[key])
            if self._enabled('u'):
                parts.append("<br /><br /><b>URL</b><hr />"
                             + definition['url'])

        self.output = ''.join(parts)
        print(self.output)

    def print_clean(self):
        """Print the collected entries as plain text (CLI output).

        The text is also stored in ``self.output``.  The selection comes
        from ``self.options``; the previous code read a module-level
        ``options`` variable that only exists when the file runs as a script.
        """
        parts = []
        for definition in self.data:
            for letters, key, label in self._CLEAN_FIELDS:
                if self._enabled(letters):
                    parts.append(label + definition[key] + "\n")
            parts.append("\n")

        self.output = ''.join(parts)
        print(self.output)


def main(argv=None):
    """Command-line entry point: ``hjp.py word [options]``.

    *argv* defaults to ``sys.argv``.  Usage help is shown when no word is
    given or when the first argument starts with ``-``.  Testing only the
    prefix (instead of any ``-`` inside the argument) keeps hyphenated words
    searchable.
    """
    argv = sys.argv if argv is None else argv

    if len(argv) < 2 or argv[1].startswith('-'):
        sys.exit('Usage: %s word [options]\n'
                 'Options return only specific information about word:\n'
                 '\t-n\tword (with accents)\n\t-u\tURL on HJP\n'
                 '\t-g\tgrammar\n\t-d\tdefinition\n'
                 '\t-s\tsyntagm\n\t-p\tphraseology\n'
                 '\t-o\tonomastics\n\t-e\tetymology\n'
                 '\tMultiple combinations are allowed too;\n'
                 '\twithout any, program returns all information' % argv[0])

    query = argv[1]
    options = 'nugdspoe'

    # every extra argument contributes its letters to the option string
    if len(argv) > 2:
        options = ''.join(argv[2:])

    word = Word()
    word.set_options(options)
    word.search(query)
    word.print_clean()


if __name__ == "__main__":
    main()