Macropodus committed on
Commit
cb10a62
·
verified ·
1 Parent(s): 6a64706

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -94
app.py CHANGED
@@ -1,94 +1,93 @@
1
- # -*- coding: utf-8 -*-
2
-
3
- import gradio as gr
4
- import operator
5
- import torch
6
- from transformers import BertTokenizer, BertForMaskedLM
7
-
8
-
9
# Load the checkpoint by its Hugging Face Hub id instead of a
# machine-specific absolute path, so the app can start on any host.
pretrained_model_name_or_path = "Macropodus/macbert4mdcspell_v2"
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path)
model = BertForMaskedLM.from_pretrained(pretrained_model_name_or_path)
# Tokenizer vocabulary; passed to the corrector as the set of known tokens.
vocab = tokenizer.vocab
14
-
15
-
16
def func_macro_correct(text):
    """Correct Chinese spelling errors in ``text`` with the masked-LM model.

    The text is tokenized and passed through the module-level ``model``; the
    highest-scoring token at each position is decoded and the decoded string
    is compared character by character with the input.

    Args:
        text: str, the sentence to correct.

    Returns:
        str, the corrected sentence, a single space, then ``str()`` of the
        error list, where each error is ``[original_char, new_char, index]``.
    """
    with torch.no_grad():  # inference only, gradients are not needed
        outputs = model(**tokenizer([text], padding=True, return_tensors='pt'))

    def flag_total_chinese(text):
        """Return True when every character of ``text`` is Chinese.

        A character counts as Chinese when it lies in U+4E00..U+9FA5.
        An empty string yields True.

        Args:
            text: str, eg. "macadam, 碎石路"
        Returns:
            bool, True or False
        """
        return all("\u4e00" <= word <= "\u9fa5" for word in text)

    def get_errors(corrected_text, origin_text, unk_tokens=None, know_tokens=None):
        """Align the model output with the input and collect the differences.

        Code adapted from: https://github.com/shibing624/pycorrector

        Args:
            corrected_text: str, text decoded from the model output.
            origin_text: str, the text the user submitted.
            unk_tokens: optional container of characters that are always
                copied from the input; a built-in list is used when it is
                missing or empty.
            know_tokens: optional container supporting ``in``; characters
                not contained in it are copied from the input. When omitted
                nothing is known, so every character is copied.

        Returns:
            tuple ``(corrected_text, errors)`` where ``errors`` is a list of
            ``[original_char, corrected_char, index]`` sorted by index.
        """
        errors = []
        unk_tokens = unk_tokens or [' ', '“', '”', '‘', '’', '琊', '\n', '…', '擤', '\t', '玕', '', ',']
        # None instead of a mutable default; an empty container keeps the
        # original "nothing is known" behaviour.
        if know_tokens is None:
            know_tokens = ()

        for i, ori_char in enumerate(origin_text):
            if i >= len(corrected_text):
                # The corrected text never grows inside this loop, so every
                # later index is out of range as well.
                break
            if ori_char in unk_tokens or ori_char not in know_tokens:
                # Unknown character: keep the original one at this position.
                corrected_text = corrected_text[:i] + ori_char + corrected_text[i + 1:]
                continue
            if ori_char != corrected_text[i]:
                if not flag_total_chinese(ori_char):
                    # Never rewrite a non-Chinese input character.
                    corrected_text = corrected_text[:i] + ori_char + corrected_text[i + 1:]
                    continue
                if not flag_total_chinese(corrected_text[i]):
                    # Drop a non-Chinese prediction.
                    # NOTE(review): this shortens corrected_text and shifts
                    # the following characters left by one -- confirm that
                    # this is the intended alignment.
                    corrected_text = corrected_text[:i] + corrected_text[i + 1:]
                    continue
                errors.append([ori_char, corrected_text[i], i])
        errors = sorted(errors, key=operator.itemgetter(2))
        return corrected_text, errors

    # Most likely token per position, decoded without special tokens and
    # with the spaces the decoder puts between tokens removed.
    _text = tokenizer.decode(torch.argmax(outputs.logits[0], dim=-1), skip_special_tokens=True).replace(' ', '')
    corrected_text = _text[:len(text)]
    corrected_text, details = get_errors(corrected_text, text, know_tokens=vocab)
    print(text, ' => ', corrected_text, details)
    return corrected_text + ' ' + str(details)
64
-
65
-
66
if __name__ == '__main__':
    # Smoke test: run one sentence through the corrector and print the result.
    print(func_macro_correct('他法语说的很好,的语也不错'))

    # Sample sentences offered as clickable examples in the web UI.
    examples = [
        "夫谷之雨,犹复云之亦从的起,因与疾风俱飘,参于天,集于的。",
        "机七学习是人工智能领遇最能体现智能的一个分知",
        '他们的吵翻很不错,再说他们做的咖喱鸡也好吃',
        "抗疫路上,除了提心吊胆也有难的得欢笑。",
        "我是练习时长两念半的鸽仁练习生蔡徐坤",
        "清晨,如纱一般地薄雾笼罩着世界。",
        "得府许我立庙于此,故请君移去尔。",
        "他法语说的很好,的语也不错",
        "遇到一位很棒的奴生跟我疗天",
        "五年级得数学,我考的很差。",
        "我们为这个目标努力不解",
        '今天兴情很好',
    ]

    # Text-in / text-out interface around the corrector.
    demo = gr.Interface(
        fn=func_macro_correct,
        inputs='text',
        outputs='text',
        title="Chinese Spelling Correction Model Macropodus/macbert4mdcspell_v2",
        description="Copy or input error Chinese text. Submit and the machine will correct text.",
        article="Link to <a href='https://github.com/yongzhuo/macro-correct' style='color:blue;' target='_blank\'>Github REPO: macro-correct</a>",
        examples=examples,
    )
    # NOTE(review): host and port are hard-coded -- confirm they match what
    # the hosting environment expects.
    demo.launch(server_name="0.0.0.0", server_port=8036, share=False, debug=True)
93
-
94
-
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import gradio as gr
4
+ import operator
5
+ import torch
6
+ from transformers import BertTokenizer, BertForMaskedLM
7
+
8
+
9
# Hugging Face Hub id of the spelling-correction checkpoint.
pretrained_model_name_or_path = "Macropodus/macbert4mdcspell_v2"

# Tokenizer and masked-LM weights are loaded once at import time and
# shared by every request.
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path)
model = BertForMaskedLM.from_pretrained(pretrained_model_name_or_path)

# Tokenizer vocabulary; passed to the corrector as the set of known tokens.
vocab = tokenizer.vocab
13
+
14
+
15
def func_macro_correct(text):
    """Correct Chinese spelling errors in ``text`` with the masked-LM model.

    The text is tokenized and passed through the module-level ``model``; the
    highest-scoring token at each position is decoded and the decoded string
    is compared character by character with the input.

    Args:
        text: str, the sentence to correct.

    Returns:
        str, the corrected sentence, a single space, then ``str()`` of the
        error list, where each error is ``[original_char, new_char, index]``.
    """
    with torch.no_grad():  # inference only, gradients are not needed
        outputs = model(**tokenizer([text], padding=True, return_tensors='pt'))

    def flag_total_chinese(text):
        """Return True when every character of ``text`` is Chinese.

        A character counts as Chinese when it lies in U+4E00..U+9FA5.
        An empty string yields True.

        Args:
            text: str, eg. "macadam, 碎石路"
        Returns:
            bool, True or False
        """
        return all("\u4e00" <= word <= "\u9fa5" for word in text)

    def get_errors(corrected_text, origin_text, unk_tokens=None, know_tokens=None):
        """Align the model output with the input and collect the differences.

        Code adapted from: https://github.com/shibing624/pycorrector

        Args:
            corrected_text: str, text decoded from the model output.
            origin_text: str, the text the user submitted.
            unk_tokens: optional container of characters that are always
                copied from the input; a built-in list is used when it is
                missing or empty.
            know_tokens: optional container supporting ``in``; characters
                not contained in it are copied from the input. When omitted
                nothing is known, so every character is copied.

        Returns:
            tuple ``(corrected_text, errors)`` where ``errors`` is a list of
            ``[original_char, corrected_char, index]`` sorted by index.
        """
        errors = []
        unk_tokens = unk_tokens or [' ', '“', '”', '‘', '’', '琊', '\n', '…', '擤', '\t', '玕', '', ',']
        # None instead of a mutable default; an empty container keeps the
        # original "nothing is known" behaviour.
        if know_tokens is None:
            know_tokens = ()

        for i, ori_char in enumerate(origin_text):
            if i >= len(corrected_text):
                # The corrected text never grows inside this loop, so every
                # later index is out of range as well.
                break
            if ori_char in unk_tokens or ori_char not in know_tokens:
                # Unknown character: keep the original one at this position.
                corrected_text = corrected_text[:i] + ori_char + corrected_text[i + 1:]
                continue
            if ori_char != corrected_text[i]:
                if not flag_total_chinese(ori_char):
                    # Never rewrite a non-Chinese input character.
                    corrected_text = corrected_text[:i] + ori_char + corrected_text[i + 1:]
                    continue
                if not flag_total_chinese(corrected_text[i]):
                    # Drop a non-Chinese prediction.
                    # NOTE(review): this shortens corrected_text and shifts
                    # the following characters left by one -- confirm that
                    # this is the intended alignment.
                    corrected_text = corrected_text[:i] + corrected_text[i + 1:]
                    continue
                errors.append([ori_char, corrected_text[i], i])
        errors = sorted(errors, key=operator.itemgetter(2))
        return corrected_text, errors

    # Most likely token per position, decoded without special tokens and
    # with the spaces the decoder puts between tokens removed.
    _text = tokenizer.decode(torch.argmax(outputs.logits[0], dim=-1), skip_special_tokens=True).replace(' ', '')
    corrected_text = _text[:len(text)]
    corrected_text, details = get_errors(corrected_text, text, know_tokens=vocab)
    print(text, ' => ', corrected_text, details)
    return corrected_text + ' ' + str(details)
63
+
64
+
65
if __name__ == '__main__':
    # Smoke test: run one sentence through the corrector and print the result.
    print(func_macro_correct('他法语说的很好,的语也不错'))

    # Sample sentences offered as clickable examples in the web UI.
    examples = [
        "夫谷之雨,犹复云之亦从的起,因与疾风俱飘,参于天,集于的。",
        "机七学习是人工智能领遇最能体现智能的一个分知",
        '他们的吵翻很不错,再说他们做的咖喱鸡也好吃',
        "抗疫路上,除了提心吊胆也有难的得欢笑。",
        "我是练习时长两念半的鸽仁练习生蔡徐坤",
        "清晨,如纱一般地薄雾笼罩着世界。",
        "得府许我立庙于此,故请君移去尔。",
        "他法语说的很好,的语也不错",
        "遇到一位很棒的奴生跟我疗天",
        "五年级得数学,我考的很差。",
        "我们为这个目标努力不解",
        '今天兴情很好',
    ]

    # Text-in / text-out interface around the corrector.
    demo = gr.Interface(
        fn=func_macro_correct,
        inputs='text',
        outputs='text',
        title="Chinese Spelling Correction Model Macropodus/macbert4mdcspell_v2",
        description="Copy or input error Chinese text. Submit and the machine will correct text.",
        article="Link to <a href='https://github.com/yongzhuo/macro-correct' style='color:blue;' target='_blank\'>Github REPO: macro-correct</a>",
        examples=examples,
    )
    # NOTE(review): host and port are hard-coded -- confirm they match what
    # the hosting environment expects.
    demo.launch(server_name="0.0.0.0", server_port=8036, share=False, debug=True)
92
+
93
+