Aplikacja rozpoznająca język tekstu w języku Python. Autor: Maciej Czekaj

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
<pre>#!/usr/bin/env python
# -*- coding: UTF-8 -*-

'''
Created on 03-09-2011

@author: Maciej Czekaj <natanielcz@gmail.com>
'''


import os
import pwd
import string
import math

# Directory holding the reference texts: '/home/<login name of the current
# uid>/Language Detector/'. The trailing slash matters because file names are
# appended to it by plain string concatenation.
path_to_base = '/home/'+pwd.getpwuid(os.getuid())[0]+'/Language Detector/'
# Built the same way as path_to_base; not referenced by the code visible in
# this file.
path_to_benchmark = '/home/'+pwd.getpwuid(os.getuid())[0]+'/Language Detector - benchmark/'
# File names of the reference texts, one list per language. Every name starts
# with a bracketed three-letter code ('[pol]', '[eng]', '[deu]'); other code in
# this file recovers the language with the slice name[1:4].
pol = ['[pol]polarny1.txt','[pol]6 GŁOSÓW OD ŚREDNIOWIECZA czyli O JA PIERDOLE!!!.txt','[pol]ŚMIERĆ LEPPERA - CHŁOP, PREMIER I--- TCHÓRZ czyli POŚMIERTNE WSPOMINKI.txt','[pol]rekurencja.txt']
eng = ['[eng]Red_Box.txt','[eng]The_London_Underground.txt','[eng]The Traditional English Wedding.txt','[eng]Enjoy A Floating Picnic At The Thames.txt','[eng]PlexyDesk – A Widget Filled Desktop For Linux.txt']
deu = ['[deu]Laatste maand van de Berlijnse Muur.txt','[deu]2009 - Rauchfrei starten.txt','[deu]Die langfristigen Trends.txt','[deu]Schmuggler mit einem Kilo Kokain im Bauch gefasst.txt','[deu]Acht Amish hinter Gittern.txt',"[deu]O'zapft is – die Wiesn hat begonnen.txt",'[deu]Tausende Apple-Fans stürmen Hamburger Store.txt','[deu]Die Qual mit dem Wahlrecht.txt']
# Language code -> list of reference file names for that language.
base = {'pol':pol,'eng':eng,'deu':deu}
# NOTE(review): this module-level dict is never read in the visible code; the
# Heurystyka function uses a local variable of the same name.
heurystyka = {}

def _normalize_letters(letters):
    '''Return {letter: count / highest count} for one flat letter-count dict.'''
    if not letters:
        # Nothing to scale, and max() of an empty sequence would raise.
        return {}
    # Compute the maximum once instead of once per letter.
    highest = float(max(letters.values()))
    return {letter: float(value) / highest for letter, value in letters.items()}


def Normalize(count, one_language=True):
    '''
    Scale letter counts so that the most frequent letter gets the value 1.0.

    count -- with one_language=True a flat {letter: count} dict; otherwise a
             nested {language: {file name: {letter: count}}} dict.
    one_language -- selects which of the two layouts ``count`` uses.

    Returns a new dict with the same layout holding floats; ``count`` itself
    is left untouched. An empty letter dict is returned as an empty dict.
    Raises ZeroDivisionError if the highest count of a non-empty letter dict
    is 0.
    '''
    if one_language:
        return _normalize_letters(count)
    return {
        lang: {
            name: _normalize_letters(letters)
            for name, letters in files.items()
        }
        for lang, files in count.items()
    }

def _count_letters(text):
    '''Return {letter: occurrences} for the ASCII letters found in ``text``.'''
    found = {}
    for char in text:
        # ascii_letters is fixed, unlike string.letters, which depends on the
        # active locale and only exists in Python 2.
        if char in string.ascii_letters:
            found[char] = found.get(char, 0) + 1
    return found


def Count_chars(baza, one_language=True):
    '''
    Count how often each ASCII letter occurs.

    baza -- with one_language=True a single text (string); otherwise a nested
            {language: {file name: text}} dict.
    one_language -- selects which of the two layouts ``baza`` uses.

    Returns {letter: count} for a single text, or
    {language: {file name: {letter: count}}} for the nested layout. Characters
    that are not ASCII letters are ignored; letters that never occur are
    absent from the result.
    '''
    if one_language:
        return _count_letters(baza)
    # Walk the files actually present in ``baza`` rather than a module-level
    # name list, so a listed-but-missing file cannot cause a KeyError.
    return {
        lang: {name: _count_letters(text) for name, text in files.items()}
        for lang, files in baza.items()
    }

def Load_Saved_Files(path):
    tresc_pol = {}
    tresc_eng = {}
    tresc_deu = {}
    print 'Base reading...'
    for file_id in range(len(os.listdir(path))):
        name = os.listdir(path)[file_id]
        if name.split('.')[1] == 'txt':
            if name in pol:
                temp = open(path+name).read()
                if name in tresc_pol.keys(): tresc_pol[name] += temp.lower()
                else: tresc_pol[name] = temp.lower()
                print name
            elif name in eng:
                temp = open(path+name).read()
                if name in tresc_eng.keys(): tresc_eng[name] += temp.lower()
                else: tresc_eng[name] = temp.lower()
                print name
            elif name in deu:
                temp = open(path+name).read()
                if name in tresc_deu.keys(): tresc_deu[name] += temp.lower()
                else: tresc_deu[name] = temp.lower()
                print name
    tresc = {'pol':tresc_pol,'eng':tresc_eng,'deu':tresc_deu}
    return tresc

def Distance(counts, counts2):
    '''
    Euclidean distance between each reference profile and one sample profile.

    counts  -- {language: {file name: {letter: value}}} reference profiles.
    counts2 -- {letter: value} profile of the sample; it must contain every
               letter that occurs in any reference profile (KeyError
               otherwise). Letters present only in counts2 are not compared.

    Returns {language: {file name: distance}} with float distances.
    '''
    result = {}
    for lang, files in counts.items():
        per_file = {}
        for name, profile in files.items():
            squared = sum(
                (value - counts2[letter]) ** 2
                for letter, value in profile.items()
            )
            per_file[name] = math.sqrt(squared)
        result[lang] = per_file
    return result

def Heurystyka(Languages):
    '''
    Pick a language code from a ranked list of candidate reference files.

    Languages -- sequence of (file name, distance) pairs. The language code of
                 a candidate is the slice name[1:4], i.e. the text inside a
                 leading bracket such as '[pol]'.

    Each candidate is scored as
        ((position + 1 + occurrences of its code in the list) // 2) / distance
    so later positions, codes that occur more often and smaller distances all
    raise the score. A distance of 0 counts as an infinitely good score.

    Returns the code of the highest-scoring candidate (the first one on ties).
    Raises ValueError when ``Languages`` is empty.
    '''
    codes = [entry[0][1:4] for entry in Languages]
    scores = []
    for place, entry in enumerate(Languages):
        # Floor division keeps the integer result this expression produces
        # under Python 2, independent of the interpreter version.
        weight = (place + 1 + codes.count(codes[place])) // 2
        dist = float(entry[1])
        if dist == 0:
            # Sample identical to this reference profile: best possible match,
            # and 1 / 0 would raise ZeroDivisionError.
            scores.append(float('inf'))
        else:
            scores.append(weight * (1 / dist))
    return codes[scores.index(max(scores))]

def Detect_Language(counts, counts2, neighbours=3):
    '''
    Guess the language of a sample from its normalized letter profile.

    counts     -- {language: {file name: {letter: value}}} reference profiles.
    counts2    -- {letter: value} profile of the sample text.
    neighbours -- how many of the closest reference files take part in the
                  final vote (default 3).

    Both ``counts`` and ``counts2`` are padded in place with zero entries so
    that every profile covers the same set of letters before distances are
    computed.

    Returns the language code chosen by Heurystyka() from the closest
    reference files. Raises ValueError (from Heurystyka) when there are no
    reference files at all.
    '''
    # Give the sample a zero for every letter it lacks ...
    for files in counts.values():
        for profile in files.values():
            for letter in profile:
                counts2.setdefault(letter, 0)
    # ... and every reference profile a zero for letters only the sample has,
    # so those letters contribute to the distance as well.
    for files in counts.values():
        for profile in files.values():
            for letter in counts2:
                profile.setdefault(letter, 0)

    distance = Distance(counts, counts2)

    # Flatten to (file name, distance) pairs and keep the nearest ones.
    candidates = [
        (name, dist)
        for files in distance.values()
        for name, dist in files.items()
    ]
    candidates.sort(key=lambda pair: pair[1])
    closest = candidates[:neighbours]
    # Farthest first: Heurystyka gives later positions a larger weight, so the
    # nearest reference file has to come last.
    closest.reverse()
    return Heurystyka(closest)

def main():
    tresc = Load_Saved_Files(path_to_base)
    name = raw_input("\nSpecify the path to a text my_file, and I'll tell you what language it is written: ")
    print "File '"+name.split('/')[-1]+"' opened"
    Language = Detect_Language(Normalize(Count_chars(tresc,False),False), Normalize(Count_chars(open(name).read().lower())))
    print '\nDetected language: ',Language

    return 0

# Run the interactive detector only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()</pre>