• 博客等级：
• 博客积分：0
• 博客访问：327,974
• 关注人气：313
• 获赠金笔：0支
• 赠出金笔：0支
• 荣誉徽章：

## N元语法之n-gram(1)

(2016-05-05 11:33:36)

### 杂谈

N元语法

n-gram 中如果n=1则为unigram，n=2则为bigram，n=3则为trigram。n≥4时，则直接用数字指称，如4-gram，5-gram。

I will
will go
go to
to United
United States

Python语言:
# Build word bigrams: every pair of adjacent tokens in the sentence.
sent = "I will go to United States."
# Tokenize on single spaces; punctuation stays attached to its word.
lst_sent = sent.split(" ")
bigram = []
for i in range(len(lst_sent) - 1):
    # Pair token i with its right-hand neighbour.
    bigram.append(lst_sent[i] + " " + lst_sent[i + 1])

>>> bigram
['I will', 'will go', 'go to', 'to United', 'United States.']

import re
# Triple-quoted raw string so the character class can contain both ' and "
# without terminating the literal early.
punctuation_pattern = re.compile(r"""[.,!?'"]""")
# Delete every punctuation character from the sentence.
no_punctuation_sent = re.sub(punctuation_pattern, "", sent)

Python语言:
import re
# Characters to strip before tokenizing; the triple-quoted raw string lets
# the character class hold both quote characters.
punctuation_pattern = re.compile(r"""[.,!?'"]""")

sent = "I will go to United States."
# Remove punctuation first so it does not stick to the neighbouring word.
no_punctuation_sent = re.sub(punctuation_pattern, "", sent)
lst_sent = no_punctuation_sent.split(" ")
bigram = []
for i in range(len(lst_sent) - 1):
    # Pair token i with its right-hand neighbour.
    bigram.append(lst_sent[i] + " " + lst_sent[i + 1])

trigram 如何实现？

Python语言:
import re
# Characters to strip before tokenizing; the triple-quoted raw string lets
# the character class hold both quote characters.
punctuation_pattern = re.compile(r"""[.,!?'"]""")

sent = "I will go to United States."
# Remove punctuation first so it does not stick to the neighbouring word.
no_punctuation_sent = re.sub(punctuation_pattern, "", sent)
lst_sent = no_punctuation_sent.split(" ")
trigram = []
# Stop two tokens before the end so that i + 2 is always a valid index.
for i in range(len(lst_sent) - 2):
    trigram.append(lst_sent[i] + " " + lst_sent[i + 1] + " " + lst_sent[i + 2])

>>> trigram
['I will go', 'will go to', 'go to United', 'to United States']

Python语言:
# -*- coding: utf-8 -*-

import re
from Tkinter import Tk
# Use a dialog box to choose the file
from tkFileDialog import askopenfilename  # Python 2 module, matching the Tkinter import above

Tk().withdraw()  # hide the empty root window; only the file dialog is shown
filename = askopenfilename()

# Read the whole file up front and close it as soon as the text is in memory.
with open(filename, "r") as fileToProcess:
    fileContent = fileToProcess.read()

def remove_punctuation(strings):
    """Return ``strings`` with punctuation marks and newlines deleted.

    The characters removed are the period, comma, semicolon, colon,
    exclamation mark, question mark, single quote, double quote and the
    newline character.

    NOTE: newlines are deleted rather than replaced by a space, so the last
    word of one line is fused with the first word of the next line.
    """
    punctuation_pattern = re.compile(r"""[.,;:!?'"\n]""")
    no_punctuation_string = re.sub(punctuation_pattern, "", strings)
    return no_punctuation_string

def bigram(lst_sent):
    """Return the bigrams of a token list as space-joined strings.

    Args:
        lst_sent: List of string tokens.

    Returns:
        A list with one "token token" string per pair of adjacent tokens,
        in order. Empty when fewer than two tokens are given.
    """
    # Zipping the list with itself shifted by one yields each adjacent pair.
    return [left + " " + right for left, right in zip(lst_sent, lst_sent[1:])]

def trigram(lst_sent):
    """Return the trigrams of a token list as space-joined strings.

    Args:
        lst_sent: List of string tokens.

    Returns:
        A list with one "token token token" string per run of three
        adjacent tokens, in order. Empty when fewer than three tokens are
        given.
    """
    # Three staggered views of the list line up every consecutive triple.
    triples = zip(lst_sent, lst_sent[1:], lst_sent[2:])
    return [first + " " + second + " " + third for first, second, third in triples]

# Strip punctuation from the file text, then tokenize on single spaces.
clean_content = remove_punctuation(fileContent)
lst_clean_content = clean_content.split(" ")
# Derive both n-gram lists from the same token list.
bigramLst, trigramLst = bigram(lst_clean_content), trigram(lst_clean_content)

>>> trigramLst[:50]
['[Emma by Jane', 'by Jane Austen', 'Jane Austen 1816]VOLUME', 'Austen 1816]VOLUME ICHAPTER', '1816]VOLUME ICHAPTER IEmma', 'ICHAPTER IEmma Woodhouse', 'IEmma Woodhouse handsome', 'Woodhouse handsome clever', 'handsome clever and', 'clever and rich', 'and rich with', 'rich with a', 'with a comfortable', 'a comfortable homeand', 'comfortable homeand happy', 'homeand happy disposition', 'happy disposition seemed', 'disposition seemed to', 'seemed to unite', 'to unite some', 'unite some of', 'some of the', 'of the best', 'the best blessingsof', 'best blessingsof existence', 'blessingsof existence and', 'existence and had', 'and had lived', 'had lived nearly', 'lived nearly twenty-one', 'nearly twenty-one years', 'twenty-one years in', 'years in the', 'in the worldwith', 'the worldwith very', 'worldwith very little', 'very little to', 'little to distress', 'to distress or', 'distress or vex', 'or vex herShe', 'vex herShe was', 'herShe was the', 'was the youngest', 'the youngest of', 'youngest of the', 'of the two', 'the two daughters', 'two daughters of', 'daughters of a']

sent=['i', 'love', 'china.', 'i', 'love', 'suzhou']
def nGram(lst, n):
    """Return every contiguous window of ``n`` tokens from ``lst``.

    Args:
        lst: Sequence of tokens.
        n: Window size; must be at least 1.

    Returns:
        A list of slices of ``lst``, each of length ``n``, in order of
        their starting position. Empty when ``lst`` has fewer than ``n``
        items.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    # Only start positions with n items remaining produce a full window;
    # range() is empty when the list is shorter than n.
    ngram = [lst[i:i + n] for i in range(len(lst) - n + 1)]
    print("Finish the process")
    return ngram

0