Windows 10
python 3.9
Строки 81 и 116 записывают в CSV
python 3.9
Django | 3.2 | 3.2.4 |
Flask | 1.1.1 | 2.0.1 |
Jinja2 | 2.11.3 | 3.0.1 |
MarkupSafe | 1.1.1 | 2.0.1 |
Pillow | 8.2.0 | 8.2.0 |
Werkzeug | 1.0.1 | 2.0.1 |
asgiref | 3.3.4 | 3.4.0 |
beautifulsoup4 | 4.9.3 | 4.9.3 |
certifi | 2020.12.5 | 2021.5.30 |
chardet | 4.0.0 | 4.0.0 |
click | 7.1.2 | 8.0.1 |
cycler | 0.10.0 | 0.10.0 |
dump | 0.0.5 | 0.0.5 |
filelock | 3.0.12 | 3.0.12 |
grandalf | 0.7 | 0.7 |
idna | 2.10 | 3.2 |
image | 1.5.33 | 1.5.33 |
itsdangerous | 1.1.0 | 2.0.1 |
joblib | 0.13.2 | 1.0.1 |
kiwisolver | 1.3.1 | 1.3.1 |
load | 2020.12.3 | 2020.12.3 |
lxml | 4.6.3 | 4.6.3 |
matplotlib | 3.4.1 | 3.4.2 |
mpmath | 1.2.1 | 1.2.1 |
netgraph | 4.0.4 | 4.0.4 |
nose | 1.3.7 | 1.3.7 |
numpy | 1.20.2 | 1.21.0 |
packaging | 20.9 | 20.9 |
pandas | 1.2.4 | 1.3.0rc1 |
pip | 21.1.2 | 21.1.3 |
pygame | 2.0.1 | 2.0.1 |
pyparsing | 2.4.7 | 3.0.0b2 |
python-dateutil | 2.8.1 | 2.8.1 |
pytz | 2021.1 | 2021.1 |
rectangle-packer | 2.0.1 | 2.0.1 |
regex | 2021.4.4 | 2021.4.4 |
requests | 2.25.1 | 2.25.1 |
sacremoses | 0.0.45 | 0.0.45 |
scipy | 1.6.3 | 1.7.0 |
selenium | 3.141.0 | 4.0.0.b4 |
setuptools | 56.0.0 | 57.0.0 |
six | 1.15.0 | 1.16.0 |
soupsieve | 2.2.1 | 2.2.1 |
sqlparse | 0.4.1 | 0.4.1 |
sympy | 1.8 | 1.8 |
tika | 1.19 | 1.24 |
tokenizers | 0.10.2 | 0.10.3 |
torch | 1.8.1 | 1.9.0 |
tqdm | 4.32.2 | 4.61.1 |
transformers | 4.5.1 | 4.8.1 |
typing-extensions | 3.10.0.0 | 3.10.0.0 |
urllib3 | 1.26.4 | 1.26.6 |
wget | 3.2 | 3.2 |
xlrd | 2.0.1 | 2.0.1 |
xmltodict | 0.12.0 | 0.12.0 |
Python:
# This is the way
# Author: pythontoday
# YouTube: https://www.youtube.com/c/PythonToday/videos
import requests
from bs4 import BeautifulSoup
import json
import csv
# --- One-time scraping step (kept commented out for reference) ---
# It downloads the category index page, caches it to index.html, extracts
# every category link and dumps the name -> URL mapping to
# all_categories_dict.json, which the main loop below reads.
# url = "https://health-diet.ru/table_calorie/?utm_source=leftMenu&utm_medium=table_calorie"
#
# Browser-like request headers so the site serves the page to the scraper.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
}
#
# req = requests.get(url, headers=headers)
# src = req.text
# print(src)
# with open("index.html", "w") as file:
#     file.write(src)
# with open("index.html") as file:
#     src = file.read()
#
# soup = BeautifulSoup(src, "lxml")
# all_products_hrefs = soup.find_all(class_="mzr-tc-group-item-href")
#
# all_categories_dict = {}
# for item in all_products_hrefs:
#     item_text = item.text
#     item_href = "https://health-diet.ru" + item.get("href")
#
#     all_categories_dict[item_text] = item_href
#
# with open("all_categories_dict.json", "w") as file:
#     json.dump(all_categories_dict, file, indent=4, ensure_ascii=False)
# Load the category name -> URL mapping produced by the (commented-out)
# one-time scraping step above.
with open("all_categories_dict.json") as file:
    all_categories = json.load(file)

iteration_count = len(all_categories) - 1  # len() already returns an int
count = 0
print(f"Всего итераций: {iteration_count}")

for category_name, category_href in all_categories.items():
    # Sanitize the category name so it is safe to embed in file names.
    for bad_char in (",", " ", "-", "'"):
        category_name = category_name.replace(bad_char, "_")

    req = requests.get(url=category_href, headers=headers)
    src = req.text

    # Cache the raw HTML on disk, then parse from the cached copy.
    with open(f"data/{count}_{category_name}.html", "w", encoding="utf-8") as file:
        file.write(src)
    with open(f"data/{count}_{category_name}.html", encoding="utf-8") as file:
        src = file.read()

    soup = BeautifulSoup(src, "lxml")

    # Skip pages that show an alert block instead of a product table.
    if soup.find(class_="uk-alert-danger") is not None:
        continue

    # Table header cells: product, calories, proteins, fats, carbohydrates.
    table_head = soup.find(class_="mzr-tc-group-table").find("tr").find_all("th")
    header_row = [th.text for th in table_head[:5]]

    # Collect per-product data once; files are written after the loop.
    products_data = soup.find(class_="mzr-tc-group-table").find("tbody").find_all("tr")
    product_info = []
    csv_rows = []
    for item in products_data:
        product_tds = item.find_all("td")
        title = product_tds[0].find("a").text
        calories = product_tds[1].text
        proteins = product_tds[2].text
        fats = product_tds[3].text
        carbohydrates = product_tds[4].text

        product_info.append(
            {
                "Title": title,
                "Calories": calories,
                "Proteins": proteins,
                "Fats": fats,
                "Carbohydrates": carbohydrates
            }
        )
        csv_rows.append((title, calories, proteins, fats, carbohydrates))

    # BUGFIX: newline="" stops csv.writer from emitting a blank line after
    # every row on Windows. The file is also written in a single pass
    # instead of being reopened in append mode for every product row.
    with open(f"data/{count}_{category_name}.csv", "w", encoding="utf-8", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(header_row)
        writer.writerows(csv_rows)

    # BUGFIX: mode "w" instead of "a" — appending a second JSON array on a
    # re-run would leave the file as invalid JSON.
    with open(f"data/{count}_{category_name}.json", "w", encoding="utf-8") as file:
        json.dump(product_info, file, indent=4, ensure_ascii=False)

    count += 1
    print(f"# Итерация {count}. {category_name} записан...")
    iteration_count = iteration_count - 1

    # NOTE(review): starting from len-1 and breaking at 0 stops one category
    # short of the full list when nothing is skipped — this mirrors the
    # original behaviour; confirm it is intentional.
    if iteration_count == 0:
        print("Работа завершена")
        break
    print(f"Осталось итераций: {iteration_count}")
Строки 81 и 116 записывают в CSV