Razlika između inačica stranice Suradnik:4ndY/hjp.py
Izvor: HrOpenWiki
m |
m |
||
Redak 1: | Redak 1: | ||
<pre> | <pre> | ||
#!/usr/bin/python | #!/usr/bin/python | ||
+ | # -*- coding: utf-8 -*- | ||
+ | |||
# author: Andrej Dundovic | # author: Andrej Dundovic | ||
− | # | + | # date: 4. 2011. |
− | # | + | # contact: andrej@dundovic.com.hr |
import urllib | import urllib | ||
import urllib2 | import urllib2 | ||
− | |||
from BeautifulSoup import BeautifulSoup | from BeautifulSoup import BeautifulSoup | ||
import re, htmlentitydefs | import re, htmlentitydefs | ||
− | + | import sys | |
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
class Word: | class Word: | ||
+ | """ class for fetching words from hjp.srce.hr """ | ||
+ | |||
+ | options = 'nugdspoe' # default options (all) | ||
useful_data = '' | useful_data = '' | ||
+ | output = '' | ||
data = dict() | data = dict() | ||
+ | service_url = 'http://hjp.srce.hr/index.php?show=search' | ||
+ | target_word = ''; | ||
+ | |||
+ | |||
+ | def get_content( self ): | ||
+ | """ get page and parse useful content """ | ||
+ | |||
+ | post_data = urllib.urlencode( { 'word' : self.target_word } ) | ||
+ | req = urllib2.Request( self.service_url, post_data ) | ||
+ | response = urllib2.urlopen( req ) | ||
+ | |||
+ | soup = BeautifulSoup( response ) | ||
+ | |||
+ | return str( soup.find( attrs={ "class" : "natuknica" } ) ) | ||
+ | |||
+ | def unescape( self, text ): | ||
+ | def fixup(m): | ||
+ | text = m.group(0) | ||
+ | if text[:2] == "&#": | ||
+ | # character reference | ||
+ | try: | ||
+ | if text[:3] == "&#x": | ||
+ | return unichr(int(text[3:-1], 16)) | ||
+ | else: | ||
+ | return unichr(int(text[2:-1])) | ||
+ | except ValueError: | ||
+ | pass | ||
+ | else: | ||
+ | # named entity | ||
+ | try: | ||
+ | text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) | ||
+ | except KeyError: | ||
+ | pass | ||
+ | return text # leave as is | ||
+ | return re.sub("&#?\w+;", fixup, text) | ||
+ | |||
+ | def remove_html_tags( self, string ): | ||
+ | """ removes HTML tags and convert HTML escaped chars in single character equivalent """ | ||
+ | |||
+ | p = re.compile(r'<.*?>') | ||
+ | return p.sub('', self.unescape( str( string ).strip() ) ).strip() | ||
def extract_data( self ): | def extract_data( self ): | ||
− | + | """ extract clean data from HTML """ | |
− | self.data['name'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "natuknica_raster_frame" } ) ) | + | |
− | + | self.data['name'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "natuknica_raster_frame" } ) ) # word | |
− | self.data[' | + | self.data['url'] = self.useful_data.find('a')['href'] # word URL |
− | + | self.data['grammar'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "osnovni_podaci_frame" } ) ) # grammar info | |
− | self.data['grammar'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "osnovni_podaci_frame" } ) ) | + | self.data['definition'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "definicija_frame" } ) ) # definition |
− | + | self.data['syntagm'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "sintagma_frame" } ) ) # sintagm | |
− | self.data['definition'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "definicija_frame" } ) ) | + | self.data['phraseology'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "frazeologija_frame" } ) ) # phraseology |
− | + | self.data['onomastics'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "onomastika_frame" } ) ) # onomastics | |
− | self.data['syntagm'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "sintagma_frame" } ) ) | + | self.data['etymology'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "etimologija_frame" } ) ) # etymology |
− | + | ||
− | self.data['phraseology'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "frazeologija_frame" } ) ) | + | |
− | + | ||
− | self.data['onomastics'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "onomastika_frame" } ) ) | + | |
− | + | ||
− | self.data['etymology'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "etimologija_frame" } ) ) | + | |
− | def __init__( self, | + | def __init__( self, word ): |
− | self.useful_data = BeautifulSoup( | + | |
− | + | self.target_word = word | |
+ | |||
+ | subpage = self.get_content() | ||
+ | |||
+ | if ( subpage == 'None' ): | ||
+ | self.output = "Rijec nije nadena" | ||
+ | else: | ||
+ | self.useful_data = BeautifulSoup( subpage ) | ||
+ | self.extract_data() | ||
+ | if ( self.data['name'] == 'None' ): | ||
+ | """ multiple choice """ | ||
+ | word_url = self.useful_data.findAll('td') | ||
+ | for choice in word_url[2:]: | ||
+ | link = BeautifulSoup( str( choice ) ) | ||
+ | self.service_url = "http://hjp.srce.hr/"+link.find('a')['href'] | ||
+ | self.target_word = '' | ||
+ | subpage = self.get_content() | ||
+ | self.useful_data = BeautifulSoup( subpage ) | ||
+ | self.extract_data() | ||
+ | self.results_formatting( 1 ) | ||
+ | else: | ||
+ | """ single choice """ | ||
+ | self.results_formatting( 0 ) | ||
+ | |||
+ | def results_formatting( self, append ): | ||
+ | if append <> 1: | ||
+ | self.output = '' | ||
+ | else: | ||
+ | self.output += '' | ||
+ | |||
+ | if 'n' in self.options: | ||
+ | self.output += "<h2>" + self.data['name'] + "</h2>" | ||
+ | if 'g' in self.options: | ||
+ | self.output += self.data['grammar'] | ||
+ | if 'd' in self.options and self.data['definition'] <> '': | ||
+ | self.output += "<br /><br /><b>Definicija</b><hr />" + self.data['definition'] | ||
+ | if 's' in self.options and self.data['syntagm'] <> '': | ||
+ | self.output += "<br /><br /><b>Sintagma</b><hr />" + self.data['syntagm'] | ||
+ | if 'f' in self.options and self.data['phraseology'] <> '': | ||
+ | self.output += "<br /><br /><b>Frazeologija</b><hr />" + self.data['phraseology'] | ||
+ | if 'o' in self.options and self.data['onomastics'] <> '': | ||
+ | self.output += "<br /><br /><b>Onomastika</b><hr />" + self.data['onomastics'] | ||
+ | if 'e' in self.options and self.data['etymology'] <> '': | ||
+ | self.output += "<br /><br /><b>Etimologija</b><hr />" + self.data['etymology'] | ||
+ | if 'u' in self.options: | ||
+ | self.output += "<br /><br /><b>URL</b><hr />" + self.data['url'] | ||
+ | |||
+ | def results( self ): | ||
+ | return self.output | ||
def print_all( self ): | def print_all( self ): | ||
Redak 91: | Redak 130: | ||
print u"> Rijec: ", self.data['name'] | print u"> Rijec: ", self.data['name'] | ||
if 'u' in options: | if 'u' in options: | ||
− | print u"> URL: ", self.data[' | + | print u"> URL: ", self.data['url'] |
if 'g' in options: | if 'g' in options: | ||
print u"> Gramatika: ", self.data['grammar'] | print u"> Gramatika: ", self.data['grammar'] | ||
Redak 106: | Redak 145: | ||
− | + | # main | |
− | + | if __name__ == "__main__": | |
− | + | if len(sys.argv) < 2 or '-' in sys.argv[1]: | |
− | if | + | sys.exit('Usage: %s word [options]\n' |
− | + | 'Options return only specific information about word:\n' | |
− | + | '\t-n\tword (with accents)\n\t-u\tURL on HJP\n' | |
− | + | '\t-g\tgrammar\n\t-d\tdefinition\n' | |
− | if ( | + | '\t-s\tsyntagm\n\t-p\tphraseology\n' |
− | + | '\t-o\tonomastics\n\t-e\tetymology\n' | |
− | + | '\tMultiple combinations are allowed too;\n' | |
− | + | '\twithout any, program returns all information' % sys.argv[0]) | |
− | + | ||
− | + | ||
− | + | ||
− | + | ||
− | + | ||
else: | else: | ||
− | + | query = sys.argv[1] | |
− | + | options = 'nugdspoe' | |
+ | |||
+ | if len(sys.argv) > 2: | ||
+ | options = ''.join( sys.argv[2:] ) | ||
+ | |||
+ | word = Word( query ) | ||
+ | word.options = options | ||
+ | word.print_all() | ||
</pre> | </pre> |
Inačica od 18:46, 8. travnja 2011.
#!/usr/bin/python # -*- coding: utf-8 -*- # author: Andrej Dundovic # date: 4. 2011. # contact: andrej@dundovic.com.hr import urllib import urllib2 from BeautifulSoup import BeautifulSoup import re, htmlentitydefs import sys class Word: """ class for fetching words from hjp.srce.hr """ options = 'nugdspoe' # default options (all) useful_data = '' output = '' data = dict() service_url = 'http://hjp.srce.hr/index.php?show=search' target_word = ''; def get_content( self ): """ get page and parse useful content """ post_data = urllib.urlencode( { 'word' : self.target_word } ) req = urllib2.Request( self.service_url, post_data ) response = urllib2.urlopen( req ) soup = BeautifulSoup( response ) return str( soup.find( attrs={ "class" : "natuknica" } ) ) def unescape( self, text ): def fixup(m): text = m.group(0) if text[:2] == "&#": # character reference try: if text[:3] == "&#x": return unichr(int(text[3:-1], 16)) else: return unichr(int(text[2:-1])) except ValueError: pass else: # named entity try: text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) except KeyError: pass return text # leave as is return re.sub("&#?\w+;", fixup, text) def remove_html_tags( self, string ): """ removes HTML tags and convert HTML escaped chars in single character equivalent """ p = re.compile(r'<.*?>') return p.sub('', self.unescape( str( string ).strip() ) ).strip() def extract_data( self ): """ extract clean data from HTML """ self.data['name'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "natuknica_raster_frame" } ) ) # word self.data['url'] = self.useful_data.find('a')['href'] # word URL self.data['grammar'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "osnovni_podaci_frame" } ) ) # grammar info self.data['definition'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "definicija_frame" } ) ) # definition self.data['syntagm'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "sintagma_frame" } ) ) # sintagm self.data['phraseology'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "frazeologija_frame" } ) ) # phraseology self.data['onomastics'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "onomastika_frame" } ) ) # onomastics self.data['etymology'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "etimologija_frame" } ) ) # etymology def __init__( self, word ): self.target_word = word subpage = self.get_content() if ( subpage == 'None' ): self.output = "Rijec nije nadena" else: self.useful_data = BeautifulSoup( subpage ) self.extract_data() if ( self.data['name'] == 'None' ): """ multiple choice """ word_url = self.useful_data.findAll('td') for choice in word_url[2:]: link = BeautifulSoup( str( choice ) ) self.service_url = "http://hjp.srce.hr/"+link.find('a')['href'] self.target_word = '' subpage = self.get_content() self.useful_data = BeautifulSoup( subpage ) self.extract_data() self.results_formatting( 1 ) else: """ single choice """ self.results_formatting( 0 ) def results_formatting( self, append ): if append <> 1: self.output = '' else: self.output += '' if 'n' in self.options: self.output += "<h2>" + self.data['name'] + "</h2>" if 'g' in self.options: self.output += self.data['grammar'] if 'd' in self.options and self.data['definition'] <> '': self.output += "<br /><br /><b>Definicija</b><hr />" + self.data['definition'] if 's' in self.options and self.data['syntagm'] <> '': self.output += "<br /><br /><b>Sintagma</b><hr />" + self.data['syntagm'] if 'f' in self.options and self.data['phraseology'] <> '': self.output += "<br /><br /><b>Frazeologija</b><hr />" + self.data['phraseology'] if 'o' in self.options and self.data['onomastics'] <> '': self.output += "<br /><br /><b>Onomastika</b><hr />" + self.data['onomastics'] if 'e' in self.options and self.data['etymology'] <> '': self.output += "<br /><br /><b>Etimologija</b><hr />" + self.data['etymology'] if 'u' in self.options: self.output += "<br /><br /><b>URL</b><hr />" + self.data['url'] def results( self ): return self.output def print_all( self ): if 'n' in options: print u"> Rijec: ", self.data['name'] if 'u' in options: print u"> URL: ", self.data['url'] if 'g' in options: print u"> Gramatika: ", self.data['grammar'] if 'd' in options: print u"> Definicije: ", self.data['definition'] if 's' in options: print u"> Sintagma: ", self.data['syntagm'] if 'f' in options: print u"> Frazeologija: ", self.data['phraseology'] if 'o' in options: print u"> Onomastika: ", self.data['onomastics'] if 'e' in options: print u"> Etimologija: ", self.data['etymology'] # main if __name__ == "__main__": if len(sys.argv) < 2 or '-' in sys.argv[1]: sys.exit('Usage: %s word [options]\n' 'Options return only specific information about word:\n' '\t-n\tword (with accents)\n\t-u\tURL on HJP\n' '\t-g\tgrammar\n\t-d\tdefinition\n' '\t-s\tsyntagm\n\t-p\tphraseology\n' '\t-o\tonomastics\n\t-e\tetymology\n' '\tMultiple combinations are allowed too;\n' '\twithout any, program returns all information' % sys.argv[0]) else: query = sys.argv[1] options = 'nugdspoe' if len(sys.argv) > 2: options = ''.join( sys.argv[2:] ) word = Word( query ) word.options = options word.print_all()