Razlika između inačica stranice Suradnik:4ndY/hjp.py
Izvor: HrOpenWiki
m |
m |
||
Redak 9: | Redak 9: | ||
if len(sys.argv) < 2: | if len(sys.argv) < 2: | ||
− | sys.exit('Usage: %s word' % sys.argv[0]) | + | sys.exit('Usage: %s word [options]\n options:\n\t-n\tname\n\t-u\tURL on HJP\n\t-g\tgrammar\n\t-d\tdefinition\n\t-s\tsyntagm\n\t-p\tphraseology\n\t-o\tonomastics\n\t-e\tetymology' % sys.argv[0]) |
else: | else: | ||
query = sys.argv[1] | query = sys.argv[1] | ||
+ | options = 'nugdspoe' | ||
+ | |||
+ | if len(sys.argv) > 2: | ||
+ | options = ''.join( sys.argv[2:] ) | ||
service_url = 'http://hjp.srce.hr/index.php?show=search' | service_url = 'http://hjp.srce.hr/index.php?show=search' | ||
Redak 45: | Redak 49: | ||
def remove_html_tags(data): | def remove_html_tags(data): | ||
p = re.compile(r'<.*?>') | p = re.compile(r'<.*?>') | ||
− | return p.sub('', unescape( str( data ) ) ) | + | return p.sub('', unescape( str( data ).strip() ) ) |
class Word: | class Word: | ||
Redak 74: | Redak 78: | ||
def print_all( self ): | def print_all( self ): | ||
− | print u"Rijec: ", self.data['name'] | + | if 'n' in options: |
− | print u"URL: ", self.data['word_url'] | + | print u"Rijec: ", self.data['name'] |
− | print u"Gramatika: ", self.data['grammar'] | + | if 'u' in options: |
− | print u"Definicije: ", self.data['definition'] | + | print u"URL: ", self.data['word_url'] |
− | print u"Sintagma: ", self.data['syntagm'] | + | if 'g' in options: |
− | print u"Frazeologija: ", self.data['phraseology'] | + | print u"Gramatika: ", self.data['grammar'] |
− | print u"Onomastika: ", self.data['onomastics'] | + | if 'd' in options: |
− | print u"Etimologija: ", self.data['etymology'] | + | print u"Definicije: ", self.data['definition'] |
+ | if 's' in options: | ||
+ | print u"Sintagma: ", self.data['syntagm'] | ||
+ | if 'f' in options: | ||
+ | print u"Frazeologija: ", self.data['phraseology'] | ||
+ | if 'o' in options: | ||
+ | print u"Onomastika: ", self.data['onomastics'] | ||
+ | if 'e' in options: | ||
+ | print u"Etimologija: ", self.data['etymology'] | ||
Redak 97: | Redak 109: | ||
word = Word( str( soup_tmp.find( attrs={ "class" : "natuknica" } ) ) ) | word = Word( str( soup_tmp.find( attrs={ "class" : "natuknica" } ) ) ) | ||
word.print_all() | word.print_all() | ||
+ | |||
else: | else: | ||
+ | """ single choice """ | ||
word.print_all() | word.print_all() | ||
</pre> | </pre> |
Inačica od 22:08, 3. travnja 2011.
#!/usr/bin/python import urllib import urllib2 import sys from BeautifulSoup import BeautifulSoup import re, htmlentitydefs if len(sys.argv) < 2: sys.exit('Usage: %s word [options]\n options:\n\t-n\tname\n\t-u\tURL on HJP\n\t-g\tgrammar\n\t-d\tdefinition\n\t-s\tsyntagm\n\t-p\tphraseology\n\t-o\tonomastics\n\t-e\tetymology' % sys.argv[0]) else: query = sys.argv[1] options = 'nugdspoe' if len(sys.argv) > 2: options = ''.join( sys.argv[2:] ) service_url = 'http://hjp.srce.hr/index.php?show=search' values = { 'word' : query } data = urllib.urlencode( values ) def get_page( service_url, data ): req = urllib2.Request( service_url, data ) response = urllib2.urlopen( req ) return response.read() def unescape(text): def fixup(m): text = m.group(0) if text[:2] == "&#": # character reference try: if text[:3] == "&#x": return unichr(int(text[3:-1], 16)) else: return unichr(int(text[2:-1])) except ValueError: pass else: # named entity try: text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) except KeyError: pass return text # leave as is return re.sub("&#?\w+;", fixup, text) def remove_html_tags(data): p = re.compile(r'<.*?>') return p.sub('', unescape( str( data ).strip() ) ) class Word: useful_data = '' data = dict() def extract_data( self ): # word self.data['name'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "natuknica_raster_frame" } ) ) # word URL self.data['word_url'] = self.useful_data.find('a')['href'] # grammar info self.data['grammar'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "osnovni_podaci_frame" } ) ) # definition self.data['definition'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "definicija_frame" } ) ) # sintagm self.data['syntagm'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "sintagma_frame" } ) ) # phraseology self.data['phraseology'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "frazeologija_frame" } ) ) # onomastics self.data['onomastics'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "onomastika_frame" } ) ) # etymology self.data['etymology'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "etimologija_frame" } ) ) def __init__( self, raw_data ): self.useful_data = BeautifulSoup( raw_data ) self.extract_data() def print_all( self ): if 'n' in options: print u"Rijec: ", self.data['name'] if 'u' in options: print u"URL: ", self.data['word_url'] if 'g' in options: print u"Gramatika: ", self.data['grammar'] if 'd' in options: print u"Definicije: ", self.data['definition'] if 's' in options: print u"Sintagma: ", self.data['syntagm'] if 'f' in options: print u"Frazeologija: ", self.data['phraseology'] if 'o' in options: print u"Onomastika: ", self.data['onomastics'] if 'e' in options: print u"Etimologija: ", self.data['etymology'] soup = BeautifulSoup( get_page( service_url, data ) ) word = Word( str( soup.find( attrs={ "class" : "natuknica" } ) ) ) if ( word.data['name'] == 'None' ): """ multiple choice """ word_url = word.useful_data.findAll('td') for choice in word_url[2:]: link = BeautifulSoup( str( choice ) ) url = "http://hjp.srce.hr/"+link.find('a')['href'] soup_tmp = BeautifulSoup( get_page( url, '' ) ) word = Word( str( soup_tmp.find( attrs={ "class" : "natuknica" } ) ) ) word.print_all() else: """ single choice """ word.print_all()