Excel не читает кирилицу, (использую utf-8), один проект работает второй нет

Ayras

Новичок
Пользователь
Июн 26, 2021
23
2
3
Windows 10
python 3.9

Django3.23.2.4
Flask1.1.12.0.1
Jinja22.11.33.0.1
MarkupSafe1.1.12.0.1
Pillow8.2.08.2.0
Werkzeug1.0.12.0.1
asgiref3.3.43.4.0
beautifulsoup44.9.34.9.3
certifi2020.12.52021.5.30
chardet4.0.04.0.0
click7.1.28.0.1
cycler0.10.00.10.0
dump0.0.50.0.5
filelock3.0.123.0.12
grandalf0.70.7
idna2.103.2
image1.5.331.5.33
itsdangerous1.1.02.0.1
joblib0.13.21.0.1
kiwisolver1.3.11.3.1
load2020.12.32020.12.3
lxml4.6.34.6.3
matplotlib3.4.13.4.2
mpmath1.2.11.2.1
netgraph4.0.44.0.4
nose1.3.71.3.7
numpy1.20.21.21.0
packaging20.920.9
pandas1.2.41.3.0rc1
pip21.1.221.1.3
pygame2.0.12.0.1
pyparsing2.4.73.0.0b2
python-dateutil2.8.12.8.1
pytz2021.12021.1
rectangle-packer2.0.12.0.1
regex2021.4.42021.4.4
requests2.25.12.25.1
sacremoses0.0.450.0.45
scipy1.6.31.7.0
selenium3.141.04.0.0.b4
setuptools56.0.057.0.0
six1.15.01.16.0
soupsieve2.2.12.2.1
sqlparse0.4.10.4.1
sympy1.81.8
tika1.191.24
tokenizers0.10.20.10.3
torch1.8.11.9.0
tqdm4.32.24.61.1
transformers4.5.14.8.1
typing-extensions3.10.0.03.10.0.0
urllib31.26.41.26.6
wget3.23.2
xlrd2.0.12.0.1
xmltodict0.12.00.12.0
Python:
# This is the way
# Author: pythontoday
# YouTube: https://www.youtube.com/c/PythonToday/videos


import requests
from bs4 import BeautifulSoup
import json
import csv


# url = "https://health-diet.ru/table_calorie/?utm_source=leftMenu&utm_medium=table_calorie"
#
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"

}
#
# req = requests.get(url, headers=headers)
# src = req.text
# print(src)

# with open("index.html", "w") as file:
#     file.write(src)

# with open("index.html") as file:
#     src = file.read()
#
# soup = BeautifulSoup(src, "lxml")
# all_products_hrefs = soup.find_all(class_="mzr-tc-group-item-href")
#
# all_categories_dict = {}
# for item in all_products_hrefs:
#     item_text = item.text
#     item_href = "https://health-diet.ru" + item.get("href")
#
#     all_categories_dict[item_text] = item_href
#
# with open("all_categories_dict.json", "w") as file:
#     json.dump(all_categories_dict, file, indent=4, ensure_ascii=False)

with open("all_categories_dict.json") as file:
    all_categories = json.load(file)

iteration_count = int(len(all_categories)) - 1
count = 0
print(f"Всего итераций: {iteration_count}")

for category_name, category_href in all_categories.items():

    rep = [",", " ", "-", "'"]
    for item in rep:
        if item in category_name:
            category_name = category_name.replace(item, "_")

    req = requests.get(url=category_href, headers=headers)
    src = req.text

    with open(f"data/{count}_{category_name}.html", "w", encoding="utf-8") as file:
        file.write(src)

    with open(f"data/{count}_{category_name}.html", encoding="utf-8") as file:
        src = file.read()

    soup = BeautifulSoup(src, "lxml")

    # проверка страницы на наличие таблицы с продуктами
    alert_block = soup.find(class_="uk-alert-danger")
    if alert_block is not None:
        continue

    # собираем заголовки таблицы
    table_head = soup.find(class_="mzr-tc-group-table").find("tr").find_all("th")
    product = table_head[0].text
    calories = table_head[1].text
    proteins = table_head[2].text
    fats = table_head[3].text
    carbohydrates = table_head[4].text

    with open(f"data/{count}_{category_name}.csv", "w", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(
            (
                product,
                calories,
                proteins,
                fats,
                carbohydrates
            )
        )

    # собираем данные продуктов
    products_data = soup.find(class_="mzr-tc-group-table").find("tbody").find_all("tr")

    product_info = []
    for item in products_data:
        product_tds = item.find_all("td")

        title = product_tds[0].find("a").text
        calories = product_tds[1].text
        proteins = product_tds[2].text
        fats = product_tds[3].text
        carbohydrates = product_tds[4].text

        product_info.append(
            {
                "Title": title,
                "Calories": calories,
                "Proteins": proteins,
                "Fats": fats,
                "Carbohydrates": carbohydrates
            }
        )

        with open(f"data/{count}_{category_name}.csv", "a", encoding="utf-8") as file:
            writer = csv.writer(file)
            writer.writerow(
                (
                    title,
                    calories,
                    proteins,
                    fats,
                    carbohydrates
                )
            )
    with open(f"data/{count}_{category_name}.json", "a", encoding="utf-8") as file:
        json.dump(product_info, file, indent=4, ensure_ascii=False)

    count += 1
    print(f"# Итерация {count}. {category_name} записан...")
    iteration_count = iteration_count - 1

    if iteration_count == 0:
        print("Работа завершена")
        break

    print(f"Осталось итераций: {iteration_count}")


81 +116 строка записывает в csv
 

Вложения

  • lesson2.zip
    2,6 КБ · Просмотры: 2

Форум IT Специалистов