前言
最近接触运动手环项目,因业务需求,需要对APP进行多国语言翻译。人工比对容易出错,而且需要大量时间反复验证,因此就想能不能通过脚本的方式去实现。
在此记录下过程:
在网上搜了一些如何用python实现google翻译的脚本,大致有三种:
1.调用google API的
2.使用别人已经封装好的库
3.类似爬虫方式获取(我没爬过,也不知道算不算)
这里采用第三种,主要是看了利用python调用谷歌翻译API这篇文章,感觉蛮简单,也感觉比较靠谱,然后就开搞了。
按照<利用python调用谷歌翻译API>这篇文章实现脚本以后,发现只能翻译成中文,而且不适合翻译多个句子。最后将这个脚本稍微改了下,支持翻译多条语句,并从一个Excel文档读取待翻译内容,将翻译结果保存到另一个Excel文档中去。
一、环境准备
1、这里我使用python3.7 + pycharm
2、需要安装的库:
pip install xlwt
pip install xlrd
pip install requests
pip install PyExecJS
3、需翻译的数据
test_Language:
4、需要存储的文件:test_result_Language,内容为空就好(注意:代码中 translate_file 指向的是 data/translate_result_Language.xls,文件名需与之保持一致)
二、实现代码
脚本目录:
主要代码
main.py
#!/usr/bin/python
import xlrd
import xlwt
import os
import threading
import time
from translate_google import get_translate
# Header text of the source column; every translation here is based on Chinese
#TRANSLATE_BASE_FIELD = "English"
TRANSLATE_BASE_FIELD = "zh-CN"
# Whether to translate with multiple threads; multithreading easily leads to
# timeouts, so it only suits small amounts of data
IS_MULTITHREADING = False
# Whether to show log output
IS_DEBUG = True
# Excel file in which the translated data is stored
translate_file = "data/translate_result_Language.xls"
# File to be translated
translate_source_file = 'data/test_Language.xls'
# Excel workbook that needs to be translated
book = xlrd.open_workbook(translate_source_file)
# Use the first sheet by default
sheet = book.sheet_by_index(0)
# Output workbook; it is written to disk at translate_file
file = xlwt.Workbook(encoding='utf-8')
def print_log(text):
    """Print ``text`` (converted with ``str``) when ``IS_DEBUG`` is enabled."""
    if not IS_DEBUG:
        return
    print(str(text))
def get_english_column():
    """Return the index of the column whose header equals TRANSLATE_BASE_FIELD.

    Only columns 1 and onwards of the header row (row 0) are inspected.
    Returns 0 when no header matches.
    """
    header_row = 0
    for candidate in range(1, sheet.ncols):
        header = sheet.cell_value(header_row, candidate)
        if header != TRANSLATE_BASE_FIELD:
            continue
        print_log(header)
        return candidate
    return 0
# Languages to translate into. Each key is the title written at the top of the
# translated column; each value is the Google Translate target-language code
# (the ``tl`` request parameter). Commented-out entries are disabled languages.
translate_dict = {
    # '中文': 'zh-CN',
    # Google's code for English is 'en'; 'english' is not a valid language code.
    '英文': 'en',
    # # Norwegian
    # 'Norwegian': 'nb',
    '中文繁体': 'zh-TW',
    # # German
    # 'German': 'de',
    # '韩语': 'ko',
    'Japanese': 'ja',
    # '法语': 'fr',
    # '西班牙语': 'es',
    # # Polish
    # 'Polski': 'pl',
    # # Italian
    # 'Italian': 'it',
    # # Hebrew
    # 'Hebrew (עברית(': 'iw',
    # # Dutch
    # 'Dutch': 'nl',
    # # Indonesian
    # 'Indonesian': 'id',
    '捷克语': 'cs',
    # '芬兰语': 'fi',
    # # Portuguese (Brazil), Portuguese (Portugal)
    # '葡萄牙语': 'pt',
    # '罗马尼亚语': 'ro',
    # '俄语': 'ru',
    # '瑞典语': 'sv',
    # '土耳其语': 'tr',
}
# Number of enabled target languages.
dict_len = len(translate_dict)
class ExcelUtil:
    """Translate the source column into one language and fill that language's sheet.

    Constructing an instance adds a sheet named ``tl`` to the module-level
    output workbook ``file`` and writes the source strings into column 0.
    ``write_to_excel`` then translates them and writes the results into column 1.
    """

    # Number of languages whose column has been written (multithreaded mode only).
    save_count = 0
    # Serialises updates of save_count between worker threads.
    _save_lock = threading.Lock()
    # How often one batch is requested before its last result is accepted as is.
    _MAX_ATTEMPTS = 5
    # A batch is sent once its accumulated text reaches this many characters.
    _MAX_BATCH_CHARS = 512

    def __init__(self, title, tl, english_col):
        """Create the sheet for ``tl`` and copy the source column into column 0.

        :param title: heading written at the top of the translated column
        :param tl: target-language code; also used as the sheet name
        :param english_col: index of the source column in the input sheet
        """
        self.tl = tl
        self.title = title
        self.english_col = english_col
        self.translate_sheet = file.add_sheet(tl)
        language_list = ["Language"]
        for row in range(1, sheet.nrows):
            english_field_name = sheet.cell_value(row, self.english_col)
            if english_field_name is None or english_field_name == "":
                # Copying stops at the first empty source cell.
                print_log("empty filed!")
                break
            language_list.append(english_field_name)
        self.write_to_sheet(language_list, 0)

    @staticmethod
    def _has_sentence_punctuation(text):
        """Return True when ``text`` contains an ASCII full stop or question mark."""
        return '.' in text or '?' in text

    @staticmethod
    def _leading_segments(results):
        """Return the first element of each entry, stopping at the first missing one."""
        segments = []
        for entry in results:
            if entry is None or entry[0] is None:
                break
            segments.append(entry[0])
        return segments

    @staticmethod
    def _present_segments(results):
        """Return the first element of every entry that has one, skipping the rest."""
        return [entry[0] for entry in results
                if entry is not None and entry[0] is not None]

    @staticmethod
    def _source_rows(translate_text, row_count):
        """Split a newline-joined batch back into exactly ``row_count`` rows."""
        rows = translate_text.split("\n")[:row_count]
        rows.extend([""] * (row_count - len(rows)))
        return rows

    def write_to_sheet(self, field_list, column=0):
        """Write ``field_list`` top-down into ``column`` of this language's sheet.

        :param field_list: values to write, one per row starting at row 0
        :param column: index of the column to fill
        :return: None
        """
        print_log("write_to_sheet")
        for row, item in enumerate(field_list):
            # Routed through print_log so that IS_DEBUG also silences this output.
            print_log(item)
            self.translate_sheet.write(row, column, item)

    def write_to_excel(self):
        """Translate the source column and write the result into column 1.

        In multithreaded mode the instance that finishes last saves the
        workbook; otherwise saving is left to the caller.
        """
        language_list = self.get_translate_list()
        print_log("===========================")
        self.write_to_sheet(language_list, 1)
        if IS_MULTITHREADING:
            # The counter is shared by all worker threads, so the
            # read-modify-write and the "am I last?" decision happen under a lock.
            with ExcelUtil._save_lock:
                ExcelUtil.save_count += 1
                print_log("ExcelUtil.save_count=%d" % ExcelUtil.save_count)
                if len(translate_dict) == ExcelUtil.save_count:
                    print_log("ExcelUtil.save_count==dict_len")
                    file.save(translate_file)
        print_log("===========================")

    def get_translate_list(self):
        """Return ``[title, translation_1, translation_2, ...]`` for the source column.

        Rows are joined with newlines and sent in batches. A batch is closed at
        the last row, when it reaches ``_MAX_BATCH_CHARS`` characters, or around
        any row containing '.' or '?', so such a row is always sent on its own.
        """
        print_log("title: %s, col: %s, tl: %s\n" % (self.title, self.english_col, self.tl))
        language_list = [self.title]
        total_row = sheet.nrows
        print_log("sheet total row=%d" % total_row)
        translate_text = ""
        row_count = 0
        for row in range(1, total_row):
            # str() keeps numeric cells (returned as floats) from breaking the
            # substring checks below.
            english_field_name = str(sheet.cell_value(row, self.english_col))
            if '\n' in english_field_name:
                print_log("row==%d, exist \\n" % row)
                # Strip embedded newlines; otherwise the translator returns
                # two results for this single row.
                english_field_name = english_field_name.replace("\n", "")
            print_log("row=%d field name = %s" % (row, english_field_name))
            next_english_field_name = ""
            if row + 1 < total_row:
                # Peek at the next row to decide whether the batch must end here.
                next_english_field_name = str(sheet.cell_value(row + 1, self.english_col))
            translate_text += english_field_name + "\n"
            row_count += 1
            must_flush = (
                row == total_row - 1
                or self._has_sentence_punctuation(next_english_field_name)
                or self._has_sentence_punctuation(english_field_name)
                or len(translate_text) >= self._MAX_BATCH_CHARS
            )
            if must_flush:
                print_log("translate_text==" + translate_text)
                self.translate_text(language_list, translate_text, row_count)
                # Reset the batch for the next request.
                row_count = 0
                translate_text = ""
        return language_list

    def _looks_untranslated(self, translate_result, translate_text, row_count, multi_sentence):
        """Return True when the result appears to echo the source text back."""
        if multi_sentence:
            segments = self._leading_segments(translate_result)
            duplicate_count = sum(1 for segment in segments if segment in translate_text)
            if duplicate_count >= 2:
                print_log("include '.' and '?',no translate, result_text==" + "".join(segments)
                          + ",duplicate_count==" + str(duplicate_count))
                return True
            return False
        duplicate_count = sum(1 for segment in self._present_segments(translate_result)
                              if segment in translate_text)
        print_log("duplicate_count==%d, row_count=%d" % (duplicate_count, row_count))
        if duplicate_count == row_count:
            print_log("no translate duplicate_count==row_count")
            return True
        return False

    def translate_text(self, language_list, translate_text, row_count):
        """Translate one batch and append exactly ``row_count`` items to ``language_list``.

        The request is repeated up to ``_MAX_ATTEMPTS`` times while the result
        is empty or looks untranslated. If the final result is still empty, the
        untranslated source rows are appended so later rows stay aligned.

        :param language_list: output list, extended in place
        :param translate_text: newline-joined source rows of this batch
        :param row_count: number of source rows contained in ``translate_text``
        :raises Exception: if the number of appended items differs from ``row_count``
        """
        multi_sentence = self._has_sentence_punctuation(translate_text)
        translate_result = []
        for attempt in range(self._MAX_ATTEMPTS):
            translate_result = get_translate(translate_text, self.tl)
            # Truthiness also covers a None result, which len() would crash on.
            if translate_result and not self._looks_untranslated(
                    translate_result, translate_text, row_count, multi_sentence):
                break
            if attempt + 1 < self._MAX_ATTEMPTS:
                time.sleep(1)

        appended = 0
        if translate_result:
            if multi_sentence:
                # The translator splits sentences; glue them back into one cell.
                merged = "".join(self._leading_segments(translate_result))
                language_list.append(replace_text(merged))
                appended = 1
                print_log(". exist")
            else:
                for segment in self._present_segments(translate_result):
                    language_list.append(replace_text(segment))
                    appended += 1
                print_log(". not exist")
        else:
            # Translation failed: fill each row with its own source text so the
            # rows after this batch still line up.
            print_log("result empty! row_count=%d" % row_count)
            for count, source_row in enumerate(self._source_rows(translate_text, row_count)):
                print_log("add empty! count=%d" % count)
                language_list.append(source_row)
                appended += 1
        print_log("this_len==%d, row_count==%d" % (appended, row_count))
        if appended != row_count:
            raise Exception("translate count not match!")
def replace_text(text):
    """Clean up a translated string for use in an Android resource file.

    ASCII single and double quotes must be backslash-escaped in Android
    resources (full-width Chinese quotes need no escaping). The rules are
    applied strictly in the listed order: spacing around a few symbols is
    repaired first, then quotes are escaped, and finally doubled backslashes
    are collapsed so that quotes which were already escaped in the input do
    not end up with two backslashes.

    :param text: translated string
    :return: the cleaned-up string
    """
    rules = (
        (r' \ "', r' \"'),
        (r' / ', r'/'),
        (r'% ', r' %'),
        (r' $ ', r'$'),
        (r'$ ', r'$'),
        # NOTE(review): as written these two rules replace a string with itself;
        # they are kept so that behaviour is unchanged. Confirm the intended
        # characters against the original script.
        (r'¥ ', r'¥ '),
        (r'¥ ', r'¥ '),
        (r"'", r"\'"),
        (r'"', r'\"'),
        ("\\\\", "\\"),
    )
    for old, new in rules:
        text = text.replace(old, new)
    return text
def del_file(_translate_file):
    """Delete ``_translate_file``; do nothing if it does not exist.

    The removal is attempted directly instead of checking for existence first,
    so a file that disappears between the check and the removal cannot cause
    an error.

    :param _translate_file: path of the file to delete
    """
    try:
        os.remove(_translate_file)
    except FileNotFoundError:
        pass
def start_translate(title, tl, _english_col):
    """Build the ExcelUtil for one target language and run its translation."""
    worker = ExcelUtil(title, tl, _english_col)
    worker.write_to_excel()
if __name__ == '__main__':
    # Start from a clean slate: remove any previous result file.
    del_file(translate_file)
    english_col = get_english_column()
    print_log("dict size==" + str(len(translate_dict)))
    print_log("english column-->" + str(english_col))
    if not IS_MULTITHREADING:
        # Sequential mode: translate language by language, then save once.
        for key, value in translate_dict.items():
            start_translate(key, value, english_col)
        file.save(translate_file)
    else:
        # Threaded mode: one thread per language; the workbook is saved from
        # ExcelUtil.write_to_excel once the last language has finished.
        for key, value in translate_dict.items():
            worker = threading.Thread(target=start_translate, args=(key, value, english_col))
            worker.start()
    # print(replace_text("This cellphone number doesn\\'t exsist, please input again"))
translate_google.py
import requests
import json
import execjs # 必须,需要先用pip 安装,用来执行js脚本
from urllib.parse import quote
# Controls whether log output is printed
debug = True
class Py4Js:
    """Runs the JavaScript routine that computes the ``tk`` token for a text."""

    # JavaScript source compiled by execjs; TL(text) returns the token string.
    _TOKEN_SCRIPT = """
function TL(a) {
var k = "";
var b = 406644;
var b1 = 3293161072;
var jd = ".";
var $b = "+-a^+6";
var Zb = "+-3^+b+-f";
for (var e = [], f = 0, g = 0; g < a.length; g++) {
var m = a.charCodeAt(g);
128 > m ? e[f++] = m : (2048 > m ? e[f++] = m >> 6 | 192 : (55296 == (m & 64512) && g + 1 < a.length && 56320 == (a.charCodeAt(g + 1) & 64512) ? (m = 65536 + ((m & 1023) << 10) + (a.charCodeAt(++g) & 1023),
e[f++] = m >> 18 | 240,
e[f++] = m >> 12 & 63 | 128) : e[f++] = m >> 12 | 224,
e[f++] = m >> 6 & 63 | 128),
e[f++] = m & 63 | 128)
}
a = b;
for (f = 0; f < e.length; f++) a += e[f],
a = RL(a, $b);
a = RL(a, Zb);
a ^= b1 || 0;
0 > a && (a = (a & 2147483647) + 2147483648);
a %= 1E6;
return a.toString() + jd + (a ^ b)
};
function RL(a, b) {
var t = "a";
var Yb = "+";
for (var c = 0; c < b.length - 2; c += 3) {
var d = b.charAt(c + 2),
d = d >= t ? d.charCodeAt(0) - 87 : Number(d),
d = b.charAt(c + 1) == Yb ? a >>> d: a << d;
a = b.charAt(c) == Yb ? a + d & 4294967295 : a ^ d
}
return a
}
"""

    def __init__(self):
        """Compile the token script once for this instance."""
        self.ctx = execjs.compile(self._TOKEN_SCRIPT)

    def get_tk(self, text):
        """Return the ``tk`` token computed by the JavaScript ``TL`` function for ``text``."""
        token = self.ctx.call("TL", text)
        return token
def build_url(text, tk, tl='zh-CN'):
    """Build the request URL for translating ``text`` into language ``tl``.

    Every caller-supplied value is percent-encoded before it is placed in the
    query string, so a stray ``&``, ``#`` or space in ``tl`` or ``tk`` cannot
    corrupt the other parameters.

    :param text: text to translate; UTF-8 percent-encoded into the ``q`` parameter
    :param tk: token computed for ``text`` (see ``Py4Js.get_tk``)
    :param tl: target-language code
    :return: the complete request URL
    """
    base = ('https://translate.google.cn/translate_a/single?client=webapp&sl=auto&tl='
            + quote(str(tl), safe='')
            + '&hl=zh-CN&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss&dt=t'
              '&source=btn&ssel=0&tsel=0&kc=0&tk=')
    return base + quote(str(tk), safe='') + '&q=' + quote(text, encoding='utf-8')
def translate(js, text, tl='zh-CN', timeout=10):
    """Request a translation of ``text`` and return the first element of the reply.

    ``tl`` is the target language, for example:
    de: German, ja: Japanese, sv: Swedish, nl: Dutch, ar: Arabic, ko: Korean,
    pt: Portuguese, zh-CN: Simplified Chinese, zh-TW: Traditional Chinese.

    :param js: object whose ``get_tk(text)`` returns the request token
    :param text: text to translate
    :param tl: target-language code
    :param timeout: seconds to wait for each HTTP request before giving up
    :return: element 0 of the decoded JSON reply; an empty list when the
        request or decoding fails or the reply carries no translation
    """
    header = {
        'authority': 'translate.google.cn',
        'method': 'GET',
        'path': '',
        'scheme': 'https',
        'accept': '*/*',
        # NOTE(review): 'br' is advertised here; confirm the installed requests/
        # urllib3 can decode brotli, otherwise such a reply cannot be parsed.
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9,ja;q=0.8',
        # NOTE(review): hard-coded browser session cookie; presumably stale by now.
        'cookie':'_ga=GA1.3.110668007.1547438795; _gid=GA1.3.1522575542.1548327032; 1P_JAR=2019-1-24-10; NID=156=ELGmtJHel1YG9Q3RxRI4HTgAc3l1n7Y6PAxGwvecTJDJ2ScgW2p-CXdvh88XFb9dTbYEBkoayWb-2vjJbB-Rhf6auRj-M-2QRUKdZG04lt7ybh8GgffGtepoA4oPN9OO9TeAoWDY0HJHDWCUwCpYzlaQK-gKCh5aVC4HVMeoppI',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
        'x-client-data': 'CKi1yQEIhrbJAQijtskBCMG2yQEIqZ3KAQioo8oBCL+nygEI7KfKAQjiqMoBGPmlygE='
    }
    url = build_url(text, js.get_tk(text), tl)
    res = []
    try:
        r = requests.get(url, headers=header, timeout=timeout)
        # The encoding must be set before .text is read for it to take effect.
        r.encoding = "UTF-8"
        result = json.loads(r.text)
        if debug:
            print(r.url)
            print(r.headers)
            print(r.request.headers)
            print(result)
        res = result[0]
        if res is None and len(result) > 7 and result[7] is not None:
            # The service answered with a "did you mean ..." suggestion instead
            # of a translation: translate the suggested text and return that.
            try:
                correct_text = result[7][0].replace('<b><i>', ' ').replace('</i></b>', '')
                if debug:
                    print(correct_text)
                correct_url = build_url(correct_text, js.get_tk(correct_text), tl)
                correct_response = requests.get(correct_url, headers=header, timeout=timeout)
                correct_response.encoding = "UTF-8"
                correct_result = json.loads(correct_response.text)
                res = correct_result[0]
            except Exception as e:
                if debug:
                    print(e)
                res = []
    except Exception as e:
        # Best effort: any network or decoding failure yields an empty result
        # so the caller can retry.
        res = []
        if debug:
            print(url)
            print("翻译" + text + "失败")
            print("错误信息:")
            print(e)
    # Returned outside any ``finally`` so KeyboardInterrupt/SystemExit are not
    # swallowed; None is normalised because callers take len() of the result.
    return res if res is not None else []
def get_translate(word, tl):
    """Translate ``word`` into language ``tl`` using a freshly built Py4Js token generator.

    :param word: text to translate
    :param tl: target-language code
    :return: whatever ``translate`` returns for this request
    """
    translate_result = translate(Py4Js(), word, tl)
    if debug:
        print("word== %s, tl== %s" % (word, tl))
        print(translate_result)
    return translate_result
if __name__ == '__main__':
    # Manual smoke test: translate one sample sentence into Czech and print it.
    debug = True
    translate_text = '3.Hear voice prompt \"start configuration mode\". click \"reset successfully\" button\n'
    results = get_translate(translate_text, 'cs')
    translate_result = ""
    if "." in translate_text or "?" in translate_text:
        # Text with '.' or '?' comes back as several segments: join them,
        # stopping at the first entry that carries no translated text.
        for result in results:
            if result is None or result[0] is None:
                break
            translate_result += result[0]
    elif results and results[0] and results[0][0] is not None:
        # Single segment: its first element is the translated text.
        translate_result = results[0][0]
    if debug:
        print("translate_result:" + translate_result)
三、运行结果
中文转捷克语
中文简体转中文繁体
参考资料:
1、https://www.jianshu.com/p/95cf6e73d6ee