Razlika između inačica stranice Suradnik:4ndY/hjp.py

Izvor: HrOpenWiki
Skoči na: orijentacija, traži
m
m
Redak 1: Redak 1:
 
<pre>
 
<pre>
 
#!/usr/bin/python
 
#!/usr/bin/python
 +
# -*- coding: utf-8 -*-
 +
 
# author: Andrej Dundovic
 
# author: Andrej Dundovic
# e-mail: andrej@dundovic.com.hr
+
# date: 4. 2011.
# date: 04. 2011.
+
# contact: andrej@dundovic.com.hr
  
 
import urllib
 
import urllib
 
import urllib2
 
import urllib2
import sys
 
 
from BeautifulSoup import BeautifulSoup
 
from BeautifulSoup import BeautifulSoup
 
import re, htmlentitydefs
 
import re, htmlentitydefs
 
+
import sys
if len(sys.argv) < 2 or '-' in sys.argv[1]:
+
    sys.exit('Usage: %s word [options]\n'
+
    'Options return only specific information about word:\n'
+
    '\t-n\tword (with accents)\n\t-u\tURL on HJP\n'
+
    '\t-g\tgrammar\n\t-d\tdefinition\n'
+
    '\t-s\tsyntagm\n\t-p\tphraseology\n'
+
    '\t-o\tonomastics\n\t-e\tetymology\n'
+
    '\tMultiple combinations are allowed too;\n'
+
    '\twithout any, program returns all information' % sys.argv[0])
+
else:
+
    query = sys.argv[1]
+
    options = 'nugdspoe'
+
 
+
if len(sys.argv) > 2:
+
    options = ''.join( sys.argv[2:] )
+
 
+
service_url = 'http://hjp.srce.hr/index.php?show=search'
+
values = { 'word' : query }
+
data = urllib.urlencode( values )
+
 
+
def get_page( service_url, data ):
+
    req = urllib2.Request( service_url, data )
+
    response = urllib2.urlopen( req )
+
    return response.read()
+
 
+
def unescape(text):
+
    def fixup(m):
+
        text = m.group(0)
+
        if text[:2] == "&#":
+
            # character reference
+
            try:
+
                if text[:3] == "&#x":
+
                    return unichr(int(text[3:-1], 16))
+
                else:
+
                    return unichr(int(text[2:-1]))
+
            except ValueError:
+
                pass
+
        else:
+
            # named entity
+
            try:
+
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
+
            except KeyError:
+
                pass
+
        return text # leave as is
+
    return re.sub("&#?\w+;", fixup, text)
+
 
+
def remove_html_tags(data):
+
    p = re.compile(r'<.*?>')
+
    return p.sub('', unescape( str( data ).strip() ) )
+
  
 
class Word:
 
class Word:
 +
    """ class for fetching words from hjp.srce.hr """
 +
   
 +
    options = 'nugdspoe' # default options (all)
 
     useful_data = ''
 
     useful_data = ''
 +
    output = ''
 
     data = dict()
 
     data = dict()
 +
    service_url = 'http://hjp.srce.hr/index.php?show=search'
 +
    target_word = '';
 +
   
 +
   
 +
    def get_content( self ):
 +
""" get page and parse useful content """
 +
 +
post_data = urllib.urlencode( { 'word' : self.target_word } )
 +
req = urllib2.Request( self.service_url, post_data )
 +
response = urllib2.urlopen( req )
 +
 +
soup = BeautifulSoup( response )
 +
 +
return str( soup.find( attrs={ "class" : "natuknica" } ) )
 +
   
 +
    def unescape( self, text ):
 +
def fixup(m):
 +
    text = m.group(0)
 +
    if text[:2] == "&#":
 +
# character reference
 +
try:
 +
    if text[:3] == "&#x":
 +
return unichr(int(text[3:-1], 16))
 +
    else:
 +
return unichr(int(text[2:-1]))
 +
except ValueError:
 +
    pass
 +
    else:
 +
# named entity
 +
try:
 +
    text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
 +
except KeyError:
 +
    pass
 +
    return text # leave as is
 +
return re.sub("&#?\w+;", fixup, text)
 +
   
 +
    def remove_html_tags( self, string ):
 +
""" removes HTML tags and convert HTML escaped chars in single character equivalent """
 +
 +
p = re.compile(r'<.*?>')
 +
return p.sub('',  self.unescape( str( string ).strip() ) ).strip()
 
      
 
      
 
     def extract_data( self ):
 
     def extract_data( self ):
# word
+
""" extract clean data from HTML """
self.data['name'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "natuknica_raster_frame" } ) )
+
# word URL
+
self.data['name'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "natuknica_raster_frame" } ) ) # word
self.data['word_url'] = self.useful_data.find('a')['href']
+
self.data['url'] = self.useful_data.find('a')['href'] # word URL
# grammar info
+
self.data['grammar'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "osnovni_podaci_frame" } ) ) # grammar info
self.data['grammar'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "osnovni_podaci_frame" } ) )
+
self.data['definition'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "definicija_frame" } ) ) # definition
# definition
+
self.data['syntagm'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "sintagma_frame" } ) ) # sintagm
self.data['definition'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "definicija_frame" } ) )
+
self.data['phraseology'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "frazeologija_frame" } ) ) # phraseology
# sintagm
+
self.data['onomastics'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "onomastika_frame" } ) ) # onomastics
self.data['syntagm'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "sintagma_frame" } ) )
+
self.data['etymology'] = self.remove_html_tags( self.useful_data.find( attrs={ "id" : "etimologija_frame" } ) ) # etymology
# phraseology
+
self.data['phraseology'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "frazeologija_frame" } ) )
+
# onomastics
+
self.data['onomastics'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "onomastika_frame" } ) )
+
# etymology
+
self.data['etymology'] = remove_html_tags( self.useful_data.find( attrs={ "id" : "etimologija_frame" } ) )
+
 
    
 
    
     def __init__( self, raw_data ):
+
     def __init__( self, word ):
self.useful_data = BeautifulSoup( raw_data )
+
     
self.extract_data()
+
self.target_word = word
 +
 +
subpage = self.get_content()
 +
 +
if ( subpage == 'None' ):
 +
    self.output = "Rijec nije nadena"
 +
else:
 +
    self.useful_data = BeautifulSoup( subpage )
 +
    self.extract_data()
 +
    if ( self.data['name'] == 'None' ):
 +
""" multiple choice """
 +
word_url = self.useful_data.findAll('td')
 +
for choice in word_url[2:]:
 +
    link = BeautifulSoup( str( choice ) )
 +
    self.service_url = "http://hjp.srce.hr/"+link.find('a')['href']
 +
    self.target_word = ''
 +
    subpage = self.get_content()
 +
    self.useful_data = BeautifulSoup( subpage )
 +
    self.extract_data()
 +
    self.results_formatting( 1 )
 +
    else:
 +
""" single choice """
 +
self.results_formatting( 0 )
 +
 
 +
    def results_formatting( self, append ):
 +
if append <> 1:
 +
    self.output = ''
 +
else:
 +
    self.output += ''
 +
   
 +
if 'n' in self.options:
 +
    self.output += "<h2>" + self.data['name'] + "</h2>"
 +
if 'g' in self.options:
 +
    self.output += self.data['grammar']
 +
if 'd' in self.options and self.data['definition'] <> '':
 +
    self.output += "<br /><br /><b>Definicija</b><hr />" + self.data['definition']
 +
if 's' in self.options and self.data['syntagm'] <> '':
 +
    self.output += "<br /><br /><b>Sintagma</b><hr />" + self.data['syntagm']
 +
if 'f' in self.options and self.data['phraseology'] <> '':
 +
    self.output += "<br /><br /><b>Frazeologija</b><hr />" + self.data['phraseology']
 +
if 'o' in self.options and self.data['onomastics'] <> '':
 +
    self.output += "<br /><br /><b>Onomastika</b><hr />" + self.data['onomastics']
 +
if 'e' in self.options and self.data['etymology'] <> '':
 +
    self.output += "<br /><br /><b>Etimologija</b><hr />" + self.data['etymology']
 +
if 'u' in self.options:
 +
    self.output += "<br /><br /><b>URL</b><hr />" + self.data['url']
 +
 
 +
    def results( self ):
 +
return self.output
  
 
     def print_all( self ):
 
     def print_all( self ):
Redak 91: Redak 130:
 
    print u"> Rijec: ", self.data['name']
 
    print u"> Rijec: ", self.data['name']
 
if 'u' in options:
 
if 'u' in options:
    print u"> URL: ", self.data['word_url']
+
    print u"> URL: ", self.data['url']
 
if 'g' in options:
 
if 'g' in options:
 
    print u"> Gramatika: ", self.data['grammar']
 
    print u"> Gramatika: ", self.data['grammar']
Redak 106: Redak 145:
 
 
  
soup = BeautifulSoup( get_page( service_url, data ) )
+
# main
subpage = str( soup.find( attrs={ "class" : "natuknica" } ) )
+
if __name__ == "__main__":
 
+
     if len(sys.argv) < 2 or '-' in sys.argv[1]:
if ( subpage == 'None' ):
+
  sys.exit('Usage: %s word [options]\n'
    print "Word not found"
+
  'Options return only specific information about word:\n'
else:
+
  '\t-n\tword (with accents)\n\t-u\tURL on HJP\n'
    word = Word( subpage )
+
  '\t-g\tgrammar\n\t-d\tdefinition\n'
     if ( word.data['name'] == 'None' ):
+
  '\t-s\tsyntagm\n\t-p\tphraseology\n'
""" multiple choice """
+
  '\t-o\tonomastics\n\t-e\tetymology\n'
word_url = word.useful_data.findAll('td')
+
  '\tMultiple combinations are allowed too;\n'
for choice in word_url[2:]:
+
  '\twithout any, program returns all information' % sys.argv[0])
    link = BeautifulSoup( str( choice ) )
+
    url = "http://hjp.srce.hr/"+link.find('a')['href']
+
    soup_tmp = BeautifulSoup( get_page( url, '' ) )
+
    word = Word( str( soup_tmp.find( attrs={ "class" : "natuknica" } ) ) )
+
    word.print_all()
+
 
     else:
 
     else:
""" single choice """
+
  query = sys.argv[1]
word.print_all()
+
  options = 'nugdspoe'
 +
 
 +
    if len(sys.argv) > 2:
 +
  options = ''.join( sys.argv[2:] )
 +
   
 +
    word = Word( query )
 +
    word.options = options
 +
    word.print_all()
 
</pre>
 
</pre>

Inačica od 19:46, 8. travnja 2011.

#!/usr/bin/python
# -*- coding: utf-8 -*-

# author: Andrej Dundovic
# date: 4. 2011.
# contact: andrej@dundovic.com.hr

import urllib
import urllib2
from BeautifulSoup import BeautifulSoup
import re, htmlentitydefs
import sys

class Word(object):
    """Fetch a dictionary entry from hjp.srce.hr and expose its parts.

    Constructing an instance performs the HTTP lookup for the given word,
    parses the returned HTML and pre-renders an HTML summary in ``output``.

    Option letters (any combination, stored in ``options``):
        n  word (with accents)      u  URL on HJP
        g  grammar                  d  definition
        s  syntagm                  p  phraseology ('f' is accepted too)
        o  onomastics               e  etymology

    Attributes:
        options: option letters selecting which sections are shown.
        useful_data: parsed HTML fragment of the current entry.
        output: HTML summary built by ``results_formatting``, or the
            "not found" message.
        data: dict with the sections of the most recently parsed entry.
        entries: one ``data`` dict per entry found (several when the
            search matched more than one word).
        service_url: URL the request is sent to.
        target_word: word sent in the POST body.
    """

    options = 'nugdspoe'  # default options (all)
    useful_data = ''
    output = ''
    # Class-level defaults only; instances always rebind ``data`` and
    # ``entries`` to fresh objects, so these are never mutated in place.
    data = dict()
    entries = ()
    service_url = 'http://hjp.srce.hr/index.php?show=search'
    target_word = ''

    # (key in ``data``, id of the HTML element holding that section)
    _SECTION_FRAMES = (
        ('name', 'natuknica_raster_frame'),
        ('grammar', 'osnovni_podaci_frame'),
        ('definition', 'definicija_frame'),
        ('syntagm', 'sintagma_frame'),
        ('phraseology', 'frazeologija_frame'),
        ('onomastics', 'onomastika_frame'),
        ('etymology', 'etimologija_frame'),
    )

    # (accepted option letters, key in ``data``, heading used in ``output``)
    _HTML_SECTIONS = (
        ('d', 'definition', 'Definicija'),
        ('s', 'syntagm', 'Sintagma'),
        ('pf', 'phraseology', 'Frazeologija'),
        ('o', 'onomastics', 'Onomastika'),
        ('e', 'etymology', 'Etimologija'),
    )

    # (accepted option letters, key in ``data``, label used by ``print_all``)
    # Labels are plain ASCII strings so they can be concatenated with either
    # byte or unicode values.
    _PRINT_FIELDS = (
        ('n', 'name', "> Rijec: "),
        ('u', 'url', "> URL: "),
        ('g', 'grammar', "> Gramatika: "),
        ('d', 'definition', "> Definicije: "),
        ('s', 'syntagm', "> Sintagma: "),
        ('pf', 'phraseology', "> Frazeologija: "),
        ('o', 'onomastics', "> Onomastika: "),
        ('e', 'etymology', "> Etimologija: "),
    )

    def get_content(self):
        """Request ``service_url`` and return the entry fragment as a string.

        ``target_word`` is sent as the POST field ``word``.  The element
        with class ``natuknica`` is returned converted with ``str()``, so
        the result is the literal string ``'None'`` when no such element
        exists in the page.
        """
        post_data = urllib.urlencode({'word': self.target_word})
        req = urllib2.Request(self.service_url, post_data)
        response = urllib2.urlopen(req)
        try:
            soup = BeautifulSoup(response)
        finally:
            # Release the connection even if parsing fails.
            response.close()

        return str(soup.find(attrs={"class": "natuknica"}))

    def unescape(self, text):
        """Replace HTML character references and named entities in ``text``.

        References that cannot be resolved are left unchanged.
        """
        def fixup(m):
            text = m.group(0)
            if text[:2] == "&#":
                # character reference
                try:
                    if text[:3] == "&#x":
                        return unichr(int(text[3:-1], 16))
                    else:
                        return unichr(int(text[2:-1]))
                except ValueError:
                    pass
            else:
                # named entity
                try:
                    text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
                except KeyError:
                    pass
            return text  # leave as is
        return re.sub(r"&#?\w+;", fixup, text)

    def remove_html_tags(self, string):
        """Return ``string`` as text with HTML tags removed and entities decoded.

        ``string`` is converted with ``str()`` first, so passing ``None``
        yields the literal text ``'None'``.
        """
        # NOTE(review): ``unescape`` may return unicode pieces while
        # ``str(string)`` is a byte string; mixing the two may fail for
        # non-ASCII markup -- confirm against real pages.
        p = re.compile(r'<.*?>')
        return p.sub('', self.unescape(str(string).strip())).strip()

    def extract_data(self):
        """Extract the clean sections of the current entry into ``data``.

        A new dict is bound to ``self.data`` on every call, so entries
        parsed earlier (and other instances) are never overwritten.
        """
        soup = self.useful_data
        entry = {}
        for key, frame_id in self._SECTION_FRAMES:
            entry[key] = self.remove_html_tags(soup.find(attrs={"id": frame_id}))

        # word URL: first link in the fragment, empty when there is none
        anchor = soup.find('a')
        if anchor is not None:
            entry['url'] = anchor.get('href', '')
        else:
            entry['url'] = ''

        self.data = entry

    def __init__(self, word):
        """Look up ``word`` and prepare ``data``, ``entries`` and ``output``.

        When the page holds no entry, ``output`` is set to the "not found"
        message and ``data``/``entries`` stay empty.  When the search
        returns a list of candidates, every linked entry is fetched and
        appended to ``entries`` and to ``output``.
        """
        self.target_word = word
        self.data = {}
        self.entries = []
        self.output = ''

        subpage = self.get_content()
        if subpage == 'None':
            self.output = "Rijec nije nadena"
            return

        self.useful_data = BeautifulSoup(subpage)
        self.extract_data()

        if self.data['name'] != 'None':
            # single choice
            self.entries.append(self.data)
            self.results_formatting(0)
            return

        # multiple choice: from the third table cell on, each cell is
        # expected to link to one candidate entry
        cells = self.useful_data.findAll('td')
        for choice in cells[2:]:
            link = BeautifulSoup(str(choice))
            anchor = link.find('a')
            if anchor is None:
                # a cell without a link cannot be followed
                continue
            self.service_url = "http://hjp.srce.hr/" + anchor['href']
            self.target_word = ''
            subpage = self.get_content()
            self.useful_data = BeautifulSoup(subpage)
            self.extract_data()
            self.entries.append(self.data)
            self.results_formatting(1)

    def results_formatting(self, append):
        """Render ``data`` as HTML into ``output``.

        ``append`` equal to 1 appends to the existing ``output``; any other
        value replaces it.  Optional sections are skipped when they are
        empty or when the section was missing from the page (in which case
        ``extract_data`` stored the text ``'None'``).
        """
        opts = self.options
        entry = self.data
        parts = []

        if 'n' in opts:
            parts.append("<h2>" + entry['name'] + "</h2>")
        if 'g' in opts:
            parts.append(entry['grammar'])
        for flags, key, title in self._HTML_SECTIONS:
            if not any(flag in opts for flag in flags):
                continue
            text = entry[key]
            if text != '' and text != 'None':
                parts.append("<br /><br /><b>" + title + "</b><hr />" + text)
        if 'u' in opts:
            parts.append("<br /><br /><b>URL</b><hr />" + entry['url'])

        rendered = ''.join(parts)
        if append != 1:
            self.output = rendered
        else:
            self.output += rendered

    def results(self):
        """Return the HTML summary built so far."""
        return self.output

    def print_all(self):
        """Print the selected sections of every entry to standard output.

        Sections are chosen by ``self.options``.  When nothing was found,
        the message stored in ``output`` is printed instead.
        """
        entries = list(self.entries)
        if not entries and self.data:
            entries = [self.data]
        if not entries:
            print(self.output)
            return

        opts = self.options
        for entry in entries:
            for flags, key, label in self._PRINT_FIELDS:
                if any(flag in opts for flag in flags):
                    # label, separating space, value -- one line per field
                    print(label + " " + entry[key])
	

# main
if __name__ == "__main__":
    # Show the usage text when no word was given or when the first argument
    # is an option.  Only a leading '-' counts, so hyphenated words can
    # still be looked up.
    if len(sys.argv) < 2 or sys.argv[1].startswith('-'):
        sys.exit('Usage: %s word [options]\n'
                 'Options return only specific information about word:\n'
                 '\t-n\tword (with accents)\n\t-u\tURL on HJP\n'
                 '\t-g\tgrammar\n\t-d\tdefinition\n'
                 '\t-s\tsyntagm\n\t-p\tphraseology\n'
                 '\t-o\tonomastics\n\t-e\tetymology\n'
                 '\tMultiple combinations are allowed too;\n'
                 '\twithout any, program returns all information' % sys.argv[0])

    query = sys.argv[1]

    # ``options`` stays a module-level name; all sections are selected
    # unless option arguments follow the word.  The arguments are simply
    # concatenated, the letters are looked up inside the resulting string.
    options = 'nugdspoe'
    if len(sys.argv) > 2:
        options = ''.join(sys.argv[2:])

    word = Word(query)
    word.options = options
    word.print_all()