Aplikacja rozpoznająca język tekstu w języku Python. Autor: Maciej Czekaj
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
Created on 03-09-2011

@author: Maciej Czekaj <natanielcz@gmail.com>

Guess the language of a text file by comparing its normalised ASCII letter
frequencies with those of a small corpus of reference texts.  The code runs
unchanged on Python 2.7 and Python 3.
'''

import collections
import io
import math
import os
import pwd
import string


def _home_dir():
    """Return the directory under which the corpus folders are looked up."""
    try:
        return '/home/' + pwd.getpwuid(os.getuid()).pw_name
    except KeyError:
        # The current uid has no passwd entry (e.g. inside a container).
        return os.path.expanduser('~')


path_to_base = _home_dir() + '/Language Detector/'
path_to_benchmark = _home_dir() + '/Language Detector - benchmark/'

# Reference texts per language.  Every file name starts with "[xxx]", where
# xxx is the language code; Heurystyka() reads the code back from there.
pol = ['[pol]polarny1.txt',
       '[pol]6 GŁOSÓW OD ŚREDNIOWIECZA czyli O JA PIERDOLE!!!.txt',
       '[pol]ŚMIERĆ LEPPERA - CHŁOP, PREMIER I--- TCHÓRZ czyli POŚMIERTNE WSPOMINKI.txt',
       '[pol]rekurencja.txt']
eng = ['[eng]Red_Box.txt',
       '[eng]The_London_Underground.txt',
       '[eng]The Traditional English Wedding.txt',
       '[eng]Enjoy A Floating Picnic At The Thames.txt',
       '[eng]PlexyDesk – A Widget Filled Desktop For Linux.txt']
deu = ['[deu]Laatste maand van de Berlijnse Muur.txt',
       '[deu]2009 - Rauchfrei starten.txt',
       '[deu]Die langfristigen Trends.txt',
       '[deu]Schmuggler mit einem Kilo Kokain im Bauch gefasst.txt',
       '[deu]Acht Amish hinter Gittern.txt',
       "[deu]O'zapft is – die Wiesn hat begonnen.txt",
       '[deu]Tausende Apple-Fans stürmen Hamburger Store.txt',
       '[deu]Die Qual mit dem Wahlrecht.txt']

base = {'pol': pol, 'eng': eng, 'deu': deu}
# Not read by any function below (Heurystyka() uses a local of the same
# name); kept because it is part of the module namespace.
heurystyka = {}

# Only ASCII letters are counted; a frozenset gives O(1) membership tests.
_LETTERS = frozenset(string.ascii_letters)

# Number of closest reference texts handed to Heurystyka().
_NEIGHBOURS = 3

try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3


def _normalize_one(count):
    """Scale one {letter: count} mapping so the largest value becomes 1.0."""
    if not count:
        return {}
    peak = float(max(count.values()))  # computed once, not once per letter
    return {letter: float(hits) / peak for letter, hits in count.items()}


def Normalize(count, one_language=True):
    """Scale letter counts so the most frequent letter maps to 1.0.

    count -- {letter: n} when one_language is true, otherwise
             {lang: {file_name: {letter: n}}}.
    one_language -- selects which of the two shapes ``count`` has.

    Returns a new mapping of the same shape holding floats.  Empty
    mappings stay empty.
    """
    if one_language:
        return _normalize_one(count)
    return {lang: {name: _normalize_one(letters)
                   for name, letters in files.items()}
            for lang, files in count.items()}


def _count_letters(text):
    """Return {letter: occurrences} for the ASCII letters in ``text``."""
    return dict(collections.Counter(ch for ch in text if ch in _LETTERS))


def Count_chars(baza, one_language=True):
    """Count ASCII letters.

    baza -- a string when one_language is true, otherwise
            {lang: {file_name: text}}.
    one_language -- selects which of the two shapes ``baza`` has.

    Returns {letter: n} or {lang: {file_name: {letter: n}}} respectively.
    Only the files actually present in ``baza`` are counted, so a reference
    file missing from disk no longer raises KeyError.
    """
    if one_language:
        return _count_letters(baza)
    return {lang: {name: _count_letters(text)
                   for name, text in files.items()}
            for lang, files in baza.items()}


def Load_Saved_Files(path):
    """Read the known reference texts found in directory ``path``.

    Only ``.txt`` entries listed in ``pol``, ``eng`` or ``deu`` are read;
    everything else in the directory is ignored.  Texts are decoded as
    UTF-8 (undecodable bytes are replaced) and lower-cased.

    Returns {'pol': {name: text}, 'eng': {...}, 'deu': {...}}.
    Raises OSError if ``path`` cannot be listed or a file cannot be read.
    """
    known = (('pol', pol), ('eng', eng), ('deu', deu))
    tresc = {lang: {} for lang, _ in known}
    print('Base reading...')
    for name in sorted(os.listdir(path)):
        if not name.endswith('.txt'):
            continue
        for lang, names in known:
            if name in names:
                full_path = os.path.join(path, name)
                with io.open(full_path, encoding='utf-8',
                             errors='replace') as handle:
                    tresc[lang][name] = handle.read().lower()
                print(name)
                break
    return tresc


def Distance(counts, counts2):
    """Euclidean distance between each reference text and the sample.

    counts  -- {lang: {file_name: {letter: frequency}}}
    counts2 -- {letter: frequency} of the text being classified

    A letter missing from either side counts as frequency 0.
    Returns {lang: {file_name: distance}}.
    """
    distance = {}
    for lang, files in counts.items():
        distance[lang] = {}
        for name, letters in files.items():
            to_sqrt = 0.0
            for letter in set(letters) | set(counts2):
                delta = letters.get(letter, 0) - counts2.get(letter, 0)
                to_sqrt += delta ** 2
            distance[lang][name] = math.sqrt(to_sqrt)
    return distance


def Heurystyka(Languages):
    """Pick a language code from a ranked list of neighbours.

    Languages -- sequence of (file_name, distance) pairs; characters 1..3
                 of file_name are the language code.  Later positions get
                 a larger rank weight.

    Score of the entry at index ``place``:
        ((place + 1 + occurrences of its language) // 2) * (1 / distance)
    The floor division reproduces the integer division the Python 2
    original performed.  A distance of 0 (exact match) scores infinity
    instead of raising ZeroDivisionError.

    Returns the language code with the highest score (first one on ties).
    Raises ValueError if ``Languages`` is empty.
    """
    if not Languages:
        raise ValueError('no reference texts to rank')
    short_langs = [entry[0][1:4] for entry in Languages]
    scores = []
    for place, entry in enumerate(Languages):
        weight = (place + 1 + short_langs.count(short_langs[place])) // 2
        dist = float(entry[1])
        closeness = 1 / dist if dist else float('inf')
        scores.append(weight * closeness)
    return short_langs[scores.index(max(scores))]


def Detect_Language(counts, counts2):
    """Return the language code whose reference texts best match the sample.

    counts  -- {lang: {file_name: {letter: frequency}}} (normalised)
    counts2 -- {letter: frequency} of the sample (normalised)

    The three reference texts closest to the sample are selected and passed
    to Heurystyka() ordered from farthest to nearest, so the nearest text
    receives the largest rank weight.  Neither argument is modified.

    Raises ValueError if ``counts`` contains no reference texts.
    """
    distance = Distance(counts, counts2)
    candidates = []
    for files in distance.values():
        candidates.extend(files.items())
    # Ties are broken by file name so the result does not depend on dict
    # iteration order.
    candidates.sort(key=lambda item: (item[1], item[0]))
    nearest = candidates[:_NEIGHBOURS]
    return Heurystyka(nearest[::-1])


def main():
    """Load the corpus, ask for a file path and print the detected language."""
    tresc = Load_Saved_Files(path_to_base)
    name = _read_line("\nSpecify the path to a text my_file, and I'll tell you what language it is written: ")
    print("File '" + name.split('/')[-1] + "' opened")
    with io.open(name, encoding='utf-8', errors='replace') as handle:
        sample = handle.read().lower()
    Language = Detect_Language(Normalize(Count_chars(tresc, False), False),
                               Normalize(Count_chars(sample)))
    # Two spaces: matches the output of the original
    # ``print '\nDetected language: ', Language`` statement.
    print('\nDetected language:  %s' % Language)
    return 0


if __name__ == '__main__':
    main()
Show Comments