乌尔都语
代码如下:
python
import requests
import re
from bs4 import BeautifulSoup as bs
import os
# Folder the downloaded mp3 files are written to.
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair, so there is no
# window between the check and the creation.
save_dir = "Locense Urdu audio"
os.makedirs(save_dir, exist_ok=True)
# Configuration
BASE_URL = "https://www.loecsen.com/en/vocabulary-urdu"
AUDIO_PREFIX = "https://www.loecsen.com/OrizonFlash_V2/ressources/son/"
# NOTE(review): SAVE_FOLDER is not referenced by the rest of this script
# (files go to save_dir) -- looks like a leftover; confirm before removing.
SAVE_FOLDER = "russian_audio"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
# URL prefix that each mp3 file name is appended to (same value as AUDIO_PREFIX).
base_mp3_url = AUDIO_PREFIX
def get_page_html():
    """Fetch the vocabulary page and return its HTML source.

    Sends a GET request to ``BASE_URL`` with the module-level ``HEADERS``.

    Returns:
        str: The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.RequestException: On connection failures or when the server
            does not answer within the timeout.
    """
    # Same 15 s limit the mp3 downloads in this script use; requests has no
    # default timeout, so without it a stalled server would hang the script.
    resp = requests.get(BASE_URL, headers=HEADERS, timeout=15)
    resp.raise_for_status()
    # Decode as UTF-8 regardless of the encoding requests guessed.
    resp.encoding = "utf-8"
    return resp.text
def _safe_filename(name):
    """Return *name* with characters that are not allowed in file names replaced by "_".

    Covers path separators, the characters Windows reserves (: * ? " < > |)
    and ASCII control characters.
    """
    return re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", name)


html = get_page_html()
soup = bs(html, 'html.parser')
all_data = []
topr_all = soup.select('div.topr')
# Visit every "div.topr" block; group_idx is its 1-based position.
for group_idx, block in enumerate(topr_all, start=1):
    colonnes_div = block.select_one('div.colonnes')
    if not colonnes_div:
        continue
    # Category name = text of the first <span> in the header div. A header
    # without a <span> gives an empty name instead of raising AttributeError.
    cate_span = colonnes_div.find('span')
    cate_name = cate_span.get_text(strip=True) if cate_span else ''
    voc_wrap = block.select_one('div.contentfiches.voc')
    if not voc_wrap:
        # Keep the category in the result even though it has no word table.
        all_data.append({"category": cate_name, "words": []})
        continue
    word_rows = voc_wrap.select('tr[data-id]')
    temp_words = []
    for tr in word_rows:
        tds = tr.find_all('td')
        # Only rows with exactly three cells are processed
        # (English, target language, pronunciation).
        if len(tds) != 3:
            continue
        td_en, td_ru, td_pron = tds
        # The mp3 file name is read from the data-id attribute of the second cell.
        mp3_file_name = td_ru.get('data-id', '')
        # Remove the speaker emoji + no-break space marker from the cell text.
        en = td_en.get_text(strip=True).replace('🔊\xa0', '')
        ru = td_ru.get_text(strip=True).replace('🔊\xa0', '')  # "ru" holds the target-language text
        pron = td_pron.get_text(strip=True)
        # Full download URL of the mp3
        full_mp3_url = base_mp3_url + mp3_file_name
        word_info = {
            "group_num": group_idx,
            "category": cate_name,  # category name is stored on every row
            "english": en,
            "language": ru,  # stored under the CSV header name "language"
            "pronunciation": pron,
            "mp3_filename": full_mp3_url,
        }
        temp_words.append(word_info)
        # The file name is built from scraped text, so it is sanitised first:
        # phrases containing "?", "/" or ":" would otherwise make open() fail
        # or point outside save_dir.
        save_name = _safe_filename(f"{group_idx},{en},{ru},{pron}.mp3")
        save_path = os.path.join(save_dir, save_name)
        if not mp3_file_name:
            # Without a file name the URL is only the bare prefix: nothing to fetch.
            print(f"❌链接失效:{full_mp3_url}")
            continue
        # Download the mp3; a failure is reported and the loop moves on.
        try:
            resp = requests.get(full_mp3_url, timeout=15)
            if resp.status_code == 200:
                with open(save_path, "wb") as f:
                    f.write(resp.content)
                print(f"✅已下载:{save_name}")
            else:
                print(f"❌链接失效:{full_mp3_url}")
        except (requests.RequestException, OSError) as e:
            # Network errors and file-system errors only; programming errors propagate.
            print(f"⚠️下载失败 {save_name} ,错误:{str(e)}")
    all_data.append({
        "category": cate_name,
        "words": temp_words,
    })
# Export the collected rows to CSV
import csv

# utf-8-sig prepends a BOM; newline="" leaves line endings to the csv module.
with open("乌尔都语词汇汇总.csv", "w", encoding="utf-8-sig", newline="") as f:
    headers = ["group_num", "category", "english", "language", "pronunciation", "mp3_filename"]
    csv_w = csv.DictWriter(f, fieldnames=headers)
    csv_w.writeheader()
    # One CSV row per word, flattened across all category groups.
    csv_w.writerows(
        row
        for entry in all_data
        for row in entry["words"]
    )
print("\n📄csv文件导出完成")
阿拉伯语
python
import requests
import re
from bs4 import BeautifulSoup as bs
import os
# Folder the downloaded mp3 files are written to.
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair, so there is no
# window between the check and the creation.
save_dir = "Locense Arab_mp3_audio"
os.makedirs(save_dir, exist_ok=True)
# Configuration
BASE_URL = "https://www.loecsen.com/en/vocabulary-arabic"
AUDIO_PREFIX = "https://www.loecsen.com/OrizonFlash_V2/ressources/son/"
# NOTE(review): SAVE_FOLDER is not referenced by the rest of this script
# (files go to save_dir) -- looks like a leftover; confirm before removing.
SAVE_FOLDER = "russian_audio"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
# URL prefix that each mp3 file name is appended to (same value as AUDIO_PREFIX).
base_mp3_url = AUDIO_PREFIX
def get_page_html():
    """Fetch the vocabulary page and return its HTML source.

    Sends a GET request to ``BASE_URL`` with the module-level ``HEADERS``.

    Returns:
        str: The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.RequestException: On connection failures or when the server
            does not answer within the timeout.
    """
    # Same 15 s limit the mp3 downloads in this script use; requests has no
    # default timeout, so without it a stalled server would hang the script.
    resp = requests.get(BASE_URL, headers=HEADERS, timeout=15)
    resp.raise_for_status()
    # Decode as UTF-8 regardless of the encoding requests guessed.
    resp.encoding = "utf-8"
    return resp.text
def _safe_filename(name):
    """Return *name* with characters that are not allowed in file names replaced by "_".

    Covers path separators, the characters Windows reserves (: * ? " < > |)
    and ASCII control characters.
    """
    return re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", name)


html = get_page_html()
soup = bs(html, 'html.parser')
all_data = []
topr_all = soup.select('div.topr')
# Visit every "div.topr" block; group_idx is its 1-based position.
for group_idx, block in enumerate(topr_all, start=1):
    colonnes_div = block.select_one('div.colonnes')
    if not colonnes_div:
        continue
    # Category name = text of the first <span> in the header div. A header
    # without a <span> gives an empty name instead of raising AttributeError.
    cate_span = colonnes_div.find('span')
    cate_name = cate_span.get_text(strip=True) if cate_span else ''
    voc_wrap = block.select_one('div.contentfiches.voc')
    if not voc_wrap:
        # Keep the category in the result even though it has no word table.
        all_data.append({"category": cate_name, "words": []})
        continue
    word_rows = voc_wrap.select('tr[data-id]')
    temp_words = []
    for tr in word_rows:
        tds = tr.find_all('td')
        # Only rows with exactly three cells are processed
        # (English, target language, pronunciation).
        if len(tds) != 3:
            continue
        td_en, td_ru, td_pron = tds
        # The mp3 file name is read from the data-id attribute of the second cell.
        mp3_file_name = td_ru.get('data-id', '')
        # Remove the speaker emoji + no-break space marker from the cell text.
        en = td_en.get_text(strip=True).replace('🔊\xa0', '')
        ru = td_ru.get_text(strip=True).replace('🔊\xa0', '')  # "ru" holds the target-language text
        pron = td_pron.get_text(strip=True)
        # Full download URL of the mp3
        full_mp3_url = base_mp3_url + mp3_file_name
        word_info = {
            "group_num": group_idx,
            "category": cate_name,  # category name is stored on every row
            "english": en,
            "language": ru,  # stored under the CSV header name "language"
            "pronunciation": pron,
            "mp3_filename": full_mp3_url,
        }
        temp_words.append(word_info)
        # The file name is built from scraped text, so it is sanitised first:
        # phrases containing "?", "/" or ":" would otherwise make open() fail
        # or point outside save_dir.
        save_name = _safe_filename(f"{group_idx},{en},{ru},{pron}.mp3")
        save_path = os.path.join(save_dir, save_name)
        if not mp3_file_name:
            # Without a file name the URL is only the bare prefix: nothing to fetch.
            print(f"❌链接失效:{full_mp3_url}")
            continue
        # Download the mp3; a failure is reported and the loop moves on.
        try:
            resp = requests.get(full_mp3_url, timeout=15)
            if resp.status_code == 200:
                with open(save_path, "wb") as f:
                    f.write(resp.content)
                print(f"✅已下载:{save_name}")
            else:
                print(f"❌链接失效:{full_mp3_url}")
        except (requests.RequestException, OSError) as e:
            # Network errors and file-system errors only; programming errors propagate.
            print(f"⚠️下载失败 {save_name} ,错误:{str(e)}")
    all_data.append({
        "category": cate_name,
        "words": temp_words,
    })
# Export the collected rows to CSV
import csv

# utf-8-sig prepends a BOM; newline="" leaves line endings to the csv module.
with open("阿拉伯语词汇汇总.csv", "w", encoding="utf-8-sig", newline="") as f:
    headers = ["group_num", "category", "english", "language", "pronunciation", "mp3_filename"]
    csv_w = csv.DictWriter(f, fieldnames=headers)
    csv_w.writeheader()
    # One CSV row per word, flattened across all category groups.
    csv_w.writerows(
        row
        for entry in all_data
        for row in entry["words"]
    )
print("\n📄csv文件导出完成")
日语
日语(Japanese)页面的表格是三列(英语、日语、发音),使用如下代码:
python
import requests
import re
from bs4 import BeautifulSoup as bs
import os
# Folder the downloaded mp3 files are written to.
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair, so there is no
# window between the check and the creation.
# NOTE(review): the German script in these notes uses the same folder name, so
# running both from one directory mixes their files -- confirm this is intended.
save_dir = "Locense mp3_audio"
os.makedirs(save_dir, exist_ok=True)
# Configuration
BASE_URL = "https://www.loecsen.com/en/vocabulary-japanese"
AUDIO_PREFIX = "https://www.loecsen.com/OrizonFlash_V2/ressources/son/"
# NOTE(review): SAVE_FOLDER is not referenced by the rest of this script
# (files go to save_dir) -- looks like a leftover; confirm before removing.
SAVE_FOLDER = "russian_audio"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
# URL prefix that each mp3 file name is appended to (same value as AUDIO_PREFIX).
base_mp3_url = AUDIO_PREFIX
def get_page_html():
    """Fetch the vocabulary page and return its HTML source.

    Sends a GET request to ``BASE_URL`` with the module-level ``HEADERS``.

    Returns:
        str: The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.RequestException: On connection failures or when the server
            does not answer within the timeout.
    """
    # Same 15 s limit the mp3 downloads in this script use; requests has no
    # default timeout, so without it a stalled server would hang the script.
    resp = requests.get(BASE_URL, headers=HEADERS, timeout=15)
    resp.raise_for_status()
    # Decode as UTF-8 regardless of the encoding requests guessed.
    resp.encoding = "utf-8"
    return resp.text
def _safe_filename(name):
    """Return *name* with characters that are not allowed in file names replaced by "_".

    Covers path separators, the characters Windows reserves (: * ? " < > |)
    and ASCII control characters.
    """
    return re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", name)


html = get_page_html()
soup = bs(html, 'html.parser')
all_data = []
topr_all = soup.select('div.topr')
# Visit every "div.topr" block; group_idx is its 1-based position.
for group_idx, block in enumerate(topr_all, start=1):
    colonnes_div = block.select_one('div.colonnes')
    if not colonnes_div:
        continue
    # Category name = text of the first <span> in the header div. A header
    # without a <span> gives an empty name instead of raising AttributeError.
    cate_span = colonnes_div.find('span')
    cate_name = cate_span.get_text(strip=True) if cate_span else ''
    voc_wrap = block.select_one('div.contentfiches.voc')
    if not voc_wrap:
        # Keep the category in the result even though it has no word table.
        all_data.append({"category": cate_name, "words": []})
        continue
    word_rows = voc_wrap.select('tr[data-id]')
    temp_words = []
    for tr in word_rows:
        tds = tr.find_all('td')
        # Only rows with exactly three cells are processed
        # (English, target language, pronunciation).
        if len(tds) != 3:
            continue
        td_en, td_ru, td_pron = tds
        # The mp3 file name is read from the data-id attribute of the second cell.
        mp3_file_name = td_ru.get('data-id', '')
        # Remove the speaker emoji + no-break space marker from the cell text.
        en = td_en.get_text(strip=True).replace('🔊\xa0', '')
        ru = td_ru.get_text(strip=True).replace('🔊\xa0', '')  # "ru" holds the target-language text
        pron = td_pron.get_text(strip=True)
        # Full download URL of the mp3
        full_mp3_url = base_mp3_url + mp3_file_name
        word_info = {
            "group_num": group_idx,
            "category": cate_name,  # category name is stored on every row
            "english": en,
            "language": ru,  # stored under the CSV header name "language"
            "pronunciation": pron,
            "mp3_filename": full_mp3_url,
        }
        temp_words.append(word_info)
        # The file name is built from scraped text, so it is sanitised first:
        # phrases containing "?", "/" or ":" would otherwise make open() fail
        # or point outside save_dir.
        save_name = _safe_filename(f"{group_idx},{en},{ru},{pron}.mp3")
        save_path = os.path.join(save_dir, save_name)
        if not mp3_file_name:
            # Without a file name the URL is only the bare prefix: nothing to fetch.
            print(f"❌链接失效:{full_mp3_url}")
            continue
        # Download the mp3; a failure is reported and the loop moves on.
        try:
            resp = requests.get(full_mp3_url, timeout=15)
            if resp.status_code == 200:
                with open(save_path, "wb") as f:
                    f.write(resp.content)
                print(f"✅已下载:{save_name}")
            else:
                print(f"❌链接失效:{full_mp3_url}")
        except (requests.RequestException, OSError) as e:
            # Network errors and file-system errors only; programming errors propagate.
            print(f"⚠️下载失败 {save_name} ,错误:{str(e)}")
    all_data.append({
        "category": cate_name,
        "words": temp_words,
    })
# Export the collected rows to CSV
import csv

# utf-8-sig prepends a BOM; newline="" leaves line endings to the csv module.
with open("日语词汇汇总.csv", "w", encoding="utf-8-sig", newline="") as f:
    headers = ["group_num", "category", "english", "language", "pronunciation", "mp3_filename"]
    csv_w = csv.DictWriter(f, fieldnames=headers)
    csv_w.writeheader()
    # One CSV row per word, flattened across all category groups.
    csv_w.writerows(
        row
        for entry in all_data
        for row in entry["words"]
    )
print("\n📄csv文件导出完成")
俄语
俄语(Russian)页面的表格是三列(英语、俄语、发音)。代码如下:
python
import requests
import re
from bs4 import BeautifulSoup as bs
import os
# Folder the downloaded mp3 files are written to.
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair, so there is no
# window between the check and the creation.
save_dir = "ru_mp3_audio"
os.makedirs(save_dir, exist_ok=True)
# Configuration
# NOTE(review): "#in-case-of-trouble" is a URL fragment; fragments are not sent
# to the server, so the request is for the whole vocabulary page.
BASE_URL = "https://www.loecsen.com/en/vocabulary-russian#in-case-of-trouble"
AUDIO_PREFIX = "https://www.loecsen.com/OrizonFlash_V2/ressources/son/"
# NOTE(review): SAVE_FOLDER is not referenced by the rest of this script
# (files go to save_dir) -- looks like a leftover; confirm before removing.
SAVE_FOLDER = "russian_audio"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
# URL prefix that each mp3 file name is appended to (same value as AUDIO_PREFIX).
base_mp3_url = AUDIO_PREFIX
# ===========================================================
def get_page_html():
    """Fetch the vocabulary page and return its HTML source.

    Sends a GET request to ``BASE_URL`` with the module-level ``HEADERS``.

    Returns:
        str: The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.RequestException: On connection failures or when the server
            does not answer within the timeout.
    """
    # Same 15 s limit the mp3 downloads in this script use; requests has no
    # default timeout, so without it a stalled server would hang the script.
    resp = requests.get(BASE_URL, headers=HEADERS, timeout=15)
    resp.raise_for_status()
    # Decode as UTF-8 regardless of the encoding requests guessed.
    resp.encoding = "utf-8"
    return resp.text
def _safe_filename(name):
    """Return *name* with characters that are not allowed in file names replaced by "_".

    Covers path separators, the characters Windows reserves (: * ? " < > |)
    and ASCII control characters.
    """
    return re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", name)


html = get_page_html()
soup = bs(html, 'html.parser')
all_data = []
topr_all = soup.select('div.topr')
# Visit every "div.topr" block; group_idx is its 1-based position.
for group_idx, block in enumerate(topr_all, start=1):
    colonnes_div = block.select_one('div.colonnes')
    if not colonnes_div:
        continue
    # Category name = text of the first <span> in the header div. A header
    # without a <span> gives an empty name instead of raising AttributeError.
    cate_span = colonnes_div.find('span')
    cate_name = cate_span.get_text(strip=True) if cate_span else ''
    voc_wrap = block.select_one('div.contentfiches.voc')
    if not voc_wrap:
        # Keep the category in the result even though it has no word table.
        all_data.append({"category": cate_name, "words": []})
        continue
    word_rows = voc_wrap.select('tr[data-id]')
    temp_words = []
    for tr in word_rows:
        tds = tr.find_all('td')
        # Only rows with exactly three cells are processed
        # (English, Russian, pronunciation).
        if len(tds) != 3:
            continue
        td_en, td_ru, td_pron = tds
        # The mp3 file name is read from the data-id attribute of the second cell.
        mp3_file_name = td_ru.get('data-id', '')
        # Remove the speaker emoji + no-break space marker from the cell text.
        en = td_en.get_text(strip=True).replace('🔊\xa0', '')
        ru = td_ru.get_text(strip=True).replace('🔊\xa0', '')
        pron = td_pron.get_text(strip=True)
        # Full download URL of the mp3
        full_mp3_url = base_mp3_url + mp3_file_name
        word_info = {
            "group_num": group_idx,
            # The CSV header written later lists "category"; without this key
            # DictWriter leaves that column empty on every row.
            "category": cate_name,
            "english": en,
            "russian": ru,
            "pronunciation": pron,
            "ru_mp3_filename": full_mp3_url,
        }
        temp_words.append(word_info)
        # File name pattern, e.g. "1,Hello,Здравствуйте,Zdravstvuyte.mp3".
        # It is built from scraped text, so it is sanitised first: phrases
        # containing "?", "/" or ":" would otherwise make open() fail or point
        # outside save_dir.
        save_name = _safe_filename(f"{group_idx},{en},{ru},{pron}.mp3")
        save_path = os.path.join(save_dir, save_name)
        if not mp3_file_name:
            # Without a file name the URL is only the bare prefix: nothing to fetch.
            print(f"❌链接失效:{full_mp3_url}")
            continue
        # Download the mp3; a failure is reported and the loop moves on.
        try:
            resp = requests.get(full_mp3_url, timeout=15)
            if resp.status_code == 200:
                with open(save_path, "wb") as f:
                    f.write(resp.content)
                print(f"✅已下载:{save_name}")
            else:
                print(f"❌链接失效:{full_mp3_url}")
        except (requests.RequestException, OSError) as e:
            # Network errors and file-system errors only; programming errors propagate.
            print(f"⚠️下载失败 {save_name} ,错误:{str(e)}")
    all_data.append({
        "category": cate_name,
        "words": temp_words,
    })
# Export the collected rows to CSV
import csv

# utf-8-sig prepends a BOM; newline="" leaves line endings to the csv module.
with open("俄语词汇汇总.csv", "w", encoding="utf-8-sig", newline="") as f:
    headers = ["group_num", "category", "english", "russian", "pronunciation", "ru_mp3_filename"]
    csv_w = csv.DictWriter(f, fieldnames=headers)
    csv_w.writeheader()
    # One CSV row per word, flattened across all category groups.
    csv_w.writerows(
        row
        for entry in all_data
        for row in entry["words"]
    )
print("\n📄csv文件导出完成")
在loecsen网站上下载相关语言学习音频资料。
西班牙语
代码如下
python
import requests
import re
from bs4 import BeautifulSoup as bs
import os
# Folder the downloaded mp3 files are written to.
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair, so there is no
# window between the check and the creation.
save_dir = "Locense Spanish mp3_audio"
os.makedirs(save_dir, exist_ok=True)
# Configuration
BASE_URL = "https://www.loecsen.com/en/vocabulary-spanish"
AUDIO_PREFIX = "https://www.loecsen.com/OrizonFlash_V2/ressources/son/"
# NOTE(review): SAVE_FOLDER is not referenced by the rest of this script
# (files go to save_dir) -- looks like a leftover; confirm before removing.
SAVE_FOLDER = "russian_audio"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
# URL prefix that each mp3 file name is appended to (same value as AUDIO_PREFIX).
base_mp3_url = AUDIO_PREFIX
def get_page_html():
    """Fetch the vocabulary page and return its HTML source.

    Sends a GET request to ``BASE_URL`` with the module-level ``HEADERS``.

    Returns:
        str: The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.RequestException: On connection failures or when the server
            does not answer within the timeout.
    """
    # Same 15 s limit the mp3 downloads in this script use; requests has no
    # default timeout, so without it a stalled server would hang the script.
    resp = requests.get(BASE_URL, headers=HEADERS, timeout=15)
    resp.raise_for_status()
    # Decode as UTF-8 regardless of the encoding requests guessed.
    resp.encoding = "utf-8"
    return resp.text
def _safe_filename(name):
    """Return *name* with characters that are not allowed in file names replaced by "_".

    Covers path separators, the characters Windows reserves (: * ? " < > |)
    and ASCII control characters.
    """
    return re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", name)


html = get_page_html()
soup = bs(html, 'html.parser')
all_data = []
topr_all = soup.select('div.topr')
# Visit every "div.topr" block; group_idx is its 1-based position.
for group_idx, block in enumerate(topr_all, start=1):
    colonnes_div = block.select_one('div.colonnes')
    if not colonnes_div:
        continue
    # Category name = text of the first <span> in the header div. A header
    # without a <span> gives an empty name instead of raising AttributeError.
    cate_span = colonnes_div.find('span')
    cate_name = cate_span.get_text(strip=True) if cate_span else ''
    voc_wrap = block.select_one('div.contentfiches.voc')
    if not voc_wrap:
        # Keep the category in the result even though it has no word table.
        all_data.append({"category": cate_name, "words": []})
        continue
    word_rows = voc_wrap.select('tr[data-id]')
    temp_words = []
    for tr in word_rows:
        tds = tr.find_all('td')
        # Only rows with exactly two cells are processed
        # (English, target language).
        if len(tds) != 2:
            continue
        td_en, td_ru = tds
        # The mp3 file name is read from the data-id attribute of the second cell.
        mp3_file_name = td_ru.get('data-id', '')
        # Remove the speaker emoji + no-break space marker from the cell text.
        en = td_en.get_text(strip=True).replace('🔊\xa0', '')
        ru = td_ru.get_text(strip=True).replace('🔊\xa0', '')  # "ru" holds the target-language text
        # Full download URL of the mp3
        full_mp3_url = base_mp3_url + mp3_file_name
        word_info = {
            "group_num": group_idx,
            "category": cate_name,  # category name is stored on every row
            "english": en,
            "language": ru,  # stored under the CSV header name "language"
            "mp3_filename": full_mp3_url,
        }
        temp_words.append(word_info)
        # The file name is built from scraped text, so it is sanitised first:
        # phrases containing "?", "/" or ":" would otherwise make open() fail
        # or point outside save_dir.
        save_name = _safe_filename(f"{group_idx},{en},{ru}.mp3")
        save_path = os.path.join(save_dir, save_name)
        if not mp3_file_name:
            # Without a file name the URL is only the bare prefix: nothing to fetch.
            print(f"❌链接失效:{full_mp3_url}")
            continue
        # Download the mp3; a failure is reported and the loop moves on.
        try:
            resp = requests.get(full_mp3_url, timeout=15)
            if resp.status_code == 200:
                with open(save_path, "wb") as f:
                    f.write(resp.content)
                print(f"✅已下载:{save_name}")
            else:
                print(f"❌链接失效:{full_mp3_url}")
        except (requests.RequestException, OSError) as e:
            # Network errors and file-system errors only; programming errors propagate.
            print(f"⚠️下载失败 {save_name} ,错误:{str(e)}")
    all_data.append({
        "category": cate_name,
        "words": temp_words,
    })
# Export the collected rows to CSV
import csv

# utf-8-sig prepends a BOM; newline="" leaves line endings to the csv module.
with open("西班牙语词汇汇总.csv", "w", encoding="utf-8-sig", newline="") as f:
    headers = ["group_num", "category", "english", "language", "mp3_filename"]
    csv_w = csv.DictWriter(f, fieldnames=headers)
    csv_w.writeheader()
    # One CSV row per word, flattened across all category groups.
    csv_w.writerows(
        row
        for entry in all_data
        for row in entry["words"]
    )
print("\n📄csv文件导出完成")
德语
对于该网站中的德语音频,因为网页中的表格只有两列(英语、德语),所以对代码略有调整:
把列数判断改为 `if len(tds) != 2: continue`,并删去不必要的第三列(发音)信息。
调整后的代码如下:
python
import requests
import re
from bs4 import BeautifulSoup as bs
import os
# Folder the downloaded mp3 files are written to.
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair, so there is no
# window between the check and the creation.
# NOTE(review): the Japanese script in these notes uses the same folder name, so
# running both from one directory mixes their files -- confirm this is intended.
save_dir = "Locense mp3_audio"
os.makedirs(save_dir, exist_ok=True)
# Configuration
BASE_URL = "https://www.loecsen.com/en/vocabulary-german"
AUDIO_PREFIX = "https://www.loecsen.com/OrizonFlash_V2/ressources/son/"
# NOTE(review): SAVE_FOLDER is not referenced by the rest of this script
# (files go to save_dir) -- looks like a leftover; confirm before removing.
SAVE_FOLDER = "russian_audio"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
# URL prefix that each mp3 file name is appended to (same value as AUDIO_PREFIX).
base_mp3_url = AUDIO_PREFIX
def get_page_html():
    """Fetch the vocabulary page and return its HTML source.

    Sends a GET request to ``BASE_URL`` with the module-level ``HEADERS``.

    Returns:
        str: The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.RequestException: On connection failures or when the server
            does not answer within the timeout.
    """
    # Same 15 s limit the mp3 downloads in this script use; requests has no
    # default timeout, so without it a stalled server would hang the script.
    resp = requests.get(BASE_URL, headers=HEADERS, timeout=15)
    resp.raise_for_status()
    # Decode as UTF-8 regardless of the encoding requests guessed.
    resp.encoding = "utf-8"
    return resp.text
def _safe_filename(name):
    """Return *name* with characters that are not allowed in file names replaced by "_".

    Covers path separators, the characters Windows reserves (: * ? " < > |)
    and ASCII control characters.
    """
    return re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", name)


html = get_page_html()
soup = bs(html, 'html.parser')
all_data = []
topr_all = soup.select('div.topr')
# Visit every "div.topr" block; group_idx is its 1-based position.
for group_idx, block in enumerate(topr_all, start=1):
    colonnes_div = block.select_one('div.colonnes')
    if not colonnes_div:
        continue
    # Category name = text of the first <span> in the header div. A header
    # without a <span> gives an empty name instead of raising AttributeError.
    cate_span = colonnes_div.find('span')
    cate_name = cate_span.get_text(strip=True) if cate_span else ''
    voc_wrap = block.select_one('div.contentfiches.voc')
    if not voc_wrap:
        # Keep the category in the result even though it has no word table.
        all_data.append({"category": cate_name, "words": []})
        continue
    word_rows = voc_wrap.select('tr[data-id]')
    temp_words = []
    for tr in word_rows:
        tds = tr.find_all('td')
        # Only rows with exactly two cells are processed
        # (English, target language).
        if len(tds) != 2:
            continue
        td_en, td_ru = tds
        # The mp3 file name is read from the data-id attribute of the second cell.
        mp3_file_name = td_ru.get('data-id', '')
        # Remove the speaker emoji + no-break space marker from the cell text.
        en = td_en.get_text(strip=True).replace('🔊\xa0', '')
        ru = td_ru.get_text(strip=True).replace('🔊\xa0', '')  # "ru" holds the target-language text
        # Full download URL of the mp3
        full_mp3_url = base_mp3_url + mp3_file_name
        word_info = {
            "group_num": group_idx,
            "category": cate_name,  # category name is stored on every row
            "english": en,
            "language": ru,  # stored under the CSV header name "language"
            "mp3_filename": full_mp3_url,
        }
        temp_words.append(word_info)
        # The file name is built from scraped text, so it is sanitised first:
        # phrases containing "?", "/" or ":" would otherwise make open() fail
        # or point outside save_dir.
        save_name = _safe_filename(f"{group_idx},{en},{ru}.mp3")
        save_path = os.path.join(save_dir, save_name)
        if not mp3_file_name:
            # Without a file name the URL is only the bare prefix: nothing to fetch.
            print(f"❌链接失效:{full_mp3_url}")
            continue
        # Download the mp3; a failure is reported and the loop moves on.
        try:
            resp = requests.get(full_mp3_url, timeout=15)
            if resp.status_code == 200:
                with open(save_path, "wb") as f:
                    f.write(resp.content)
                print(f"✅已下载:{save_name}")
            else:
                print(f"❌链接失效:{full_mp3_url}")
        except (requests.RequestException, OSError) as e:
            # Network errors and file-system errors only; programming errors propagate.
            print(f"⚠️下载失败 {save_name} ,错误:{str(e)}")
    all_data.append({
        "category": cate_name,
        "words": temp_words,
    })
# Export the collected rows to CSV
import csv

# utf-8-sig prepends a BOM; newline="" leaves line endings to the csv module.
with open("德语词汇汇总.csv", "w", encoding="utf-8-sig", newline="") as f:
    headers = ["group_num", "category", "english", "language", "mp3_filename"]
    csv_w = csv.DictWriter(f, fieldnames=headers)
    csv_w.writeheader()
    # One CSV row per word, flattened across all category groups.
    csv_w.writerows(
        row
        for entry in all_data
        for row in entry["words"]
    )
print("\n📄csv文件导出完成")
法语
代码如下:
python
import requests
import re
from bs4 import BeautifulSoup as bs
import os
# Folder the downloaded mp3 files are written to.
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair, so there is no
# window between the check and the creation.
save_dir = "Locense Franch mp3_audio"
os.makedirs(save_dir, exist_ok=True)
# Configuration
BASE_URL = "https://www.loecsen.com/en/vocabulary-french"
AUDIO_PREFIX = "https://www.loecsen.com/OrizonFlash_V2/ressources/son/"
# NOTE(review): SAVE_FOLDER is not referenced by the rest of this script
# (files go to save_dir) -- looks like a leftover; confirm before removing.
SAVE_FOLDER = "russian_audio"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
# URL prefix that each mp3 file name is appended to (same value as AUDIO_PREFIX).
base_mp3_url = AUDIO_PREFIX
def get_page_html():
    """Fetch the vocabulary page and return its HTML source.

    Sends a GET request to ``BASE_URL`` with the module-level ``HEADERS``.

    Returns:
        str: The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.RequestException: On connection failures or when the server
            does not answer within the timeout.
    """
    # Same 15 s limit the mp3 downloads in this script use; requests has no
    # default timeout, so without it a stalled server would hang the script.
    resp = requests.get(BASE_URL, headers=HEADERS, timeout=15)
    resp.raise_for_status()
    # Decode as UTF-8 regardless of the encoding requests guessed.
    resp.encoding = "utf-8"
    return resp.text
def _safe_filename(name):
    """Return *name* with characters that are not allowed in file names replaced by "_".

    Covers path separators, the characters Windows reserves (: * ? " < > |)
    and ASCII control characters.
    """
    return re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", name)


html = get_page_html()
soup = bs(html, 'html.parser')
all_data = []
topr_all = soup.select('div.topr')
# Visit every "div.topr" block; group_idx is its 1-based position.
for group_idx, block in enumerate(topr_all, start=1):
    colonnes_div = block.select_one('div.colonnes')
    if not colonnes_div:
        continue
    # Category name = text of the first <span> in the header div. A header
    # without a <span> gives an empty name instead of raising AttributeError.
    cate_span = colonnes_div.find('span')
    cate_name = cate_span.get_text(strip=True) if cate_span else ''
    voc_wrap = block.select_one('div.contentfiches.voc')
    if not voc_wrap:
        # Keep the category in the result even though it has no word table.
        all_data.append({"category": cate_name, "words": []})
        continue
    word_rows = voc_wrap.select('tr[data-id]')
    temp_words = []
    for tr in word_rows:
        tds = tr.find_all('td')
        # Only rows with exactly two cells are processed
        # (English, target language).
        if len(tds) != 2:
            continue
        td_en, td_ru = tds
        # The mp3 file name is read from the data-id attribute of the second cell.
        mp3_file_name = td_ru.get('data-id', '')
        # Remove the speaker emoji + no-break space marker from the cell text.
        en = td_en.get_text(strip=True).replace('🔊\xa0', '')
        ru = td_ru.get_text(strip=True).replace('🔊\xa0', '')  # "ru" holds the target-language text
        # Full download URL of the mp3
        full_mp3_url = base_mp3_url + mp3_file_name
        word_info = {
            "group_num": group_idx,
            "category": cate_name,  # category name is stored on every row
            "english": en,
            "language": ru,  # stored under the CSV header name "language"
            "mp3_filename": full_mp3_url,
        }
        temp_words.append(word_info)
        # The file name is built from scraped text, so it is sanitised first:
        # phrases containing "?", "/" or ":" would otherwise make open() fail
        # or point outside save_dir.
        save_name = _safe_filename(f"{group_idx},{en},{ru}.mp3")
        save_path = os.path.join(save_dir, save_name)
        if not mp3_file_name:
            # Without a file name the URL is only the bare prefix: nothing to fetch.
            print(f"❌链接失效:{full_mp3_url}")
            continue
        # Download the mp3; a failure is reported and the loop moves on.
        try:
            resp = requests.get(full_mp3_url, timeout=15)
            if resp.status_code == 200:
                with open(save_path, "wb") as f:
                    f.write(resp.content)
                print(f"✅已下载:{save_name}")
            else:
                print(f"❌链接失效:{full_mp3_url}")
        except (requests.RequestException, OSError) as e:
            # Network errors and file-system errors only; programming errors propagate.
            print(f"⚠️下载失败 {save_name} ,错误:{str(e)}")
    all_data.append({
        "category": cate_name,
        "words": temp_words,
    })
# Export the collected rows to CSV
import csv

# utf-8-sig prepends a BOM; newline="" leaves line endings to the csv module.
with open("法语词汇汇总.csv", "w", encoding="utf-8-sig", newline="") as f:
    headers = ["group_num", "category", "english", "language", "mp3_filename"]
    csv_w = csv.DictWriter(f, fieldnames=headers)
    csv_w.writeheader()
    # One CSV row per word, flattened across all category groups.
    csv_w.writerows(
        row
        for entry in all_data
        for row in entry["words"]
    )
print("\n📄csv文件导出完成")
即可完成。