Windows 10
python 3.9
Строки 81 и 116 записывают в CSV
python 3.9
Django | 3.2 | 3.2.4 |
Flask | 1.1.1 | 2.0.1 |
Jinja2 | 2.11.3 | 3.0.1 |
MarkupSafe | 1.1.1 | 2.0.1 |
Pillow | 8.2.0 | 8.2.0 |
Werkzeug | 1.0.1 | 2.0.1 |
asgiref | 3.3.4 | 3.4.0 |
beautifulsoup4 | 4.9.3 | 4.9.3 |
certifi | 2020.12.5 | 2021.5.30 |
chardet | 4.0.0 | 4.0.0 |
click | 7.1.2 | 8.0.1 |
cycler | 0.10.0 | 0.10.0 |
dump | 0.0.5 | 0.0.5 |
filelock | 3.0.12 | 3.0.12 |
grandalf | 0.7 | 0.7 |
idna | 2.10 | 3.2 |
image | 1.5.33 | 1.5.33 |
itsdangerous | 1.1.0 | 2.0.1 |
joblib | 0.13.2 | 1.0.1 |
kiwisolver | 1.3.1 | 1.3.1 |
load | 2020.12.3 | 2020.12.3 |
lxml | 4.6.3 | 4.6.3 |
matplotlib | 3.4.1 | 3.4.2 |
mpmath | 1.2.1 | 1.2.1 |
netgraph | 4.0.4 | 4.0.4 |
nose | 1.3.7 | 1.3.7 |
numpy | 1.20.2 | 1.21.0 |
packaging | 20.9 | 20.9 |
pandas | 1.2.4 | 1.3.0rc1 |
pip | 21.1.2 | 21.1.3 |
pygame | 2.0.1 | 2.0.1 |
pyparsing | 2.4.7 | 3.0.0b2 |
python-dateutil | 2.8.1 | 2.8.1 |
pytz | 2021.1 | 2021.1 |
rectangle-packer | 2.0.1 | 2.0.1 |
regex | 2021.4.4 | 2021.4.4 |
requests | 2.25.1 | 2.25.1 |
sacremoses | 0.0.45 | 0.0.45 |
scipy | 1.6.3 | 1.7.0 |
selenium | 3.141.0 | 4.0.0.b4 |
setuptools | 56.0.0 | 57.0.0 |
six | 1.15.0 | 1.16.0 |
soupsieve | 2.2.1 | 2.2.1 |
sqlparse | 0.4.1 | 0.4.1 |
sympy | 1.8 | 1.8 |
tika | 1.19 | 1.24 |
tokenizers | 0.10.2 | 0.10.3 |
torch | 1.8.1 | 1.9.0 |
tqdm | 4.32.2 | 4.61.1 |
transformers | 4.5.1 | 4.8.1 |
typing-extensions | 3.10.0.0 | 3.10.0.0 |
urllib3 | 1.26.4 | 1.26.6 |
wget | 3.2 | 3.2 |
xlrd | 2.0.1 | 2.0.1 |
xmltodict | 0.12.0 | 0.12.0 |
Python:
# This is the way
# Author: pythontoday
# YouTube: https://www.youtube.com/c/PythonToday/videos
import requests
from bs4 import BeautifulSoup
import json
import csv
# --- One-time scraping step (kept commented out for reference) ---
# It downloads the category index page, caches it to index.html, extracts
# every category link and dumps the name -> URL mapping to
# all_categories_dict.json, which the main loop below reads.
# url = "https://health-diet.ru/table_calorie/?utm_source=leftMenu&utm_medium=table_calorie"
#
# Browser-like request headers so the site serves the page to the scraper.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
}
#
# req = requests.get(url, headers=headers)
# src = req.text
# print(src)
# with open("index.html", "w") as file:
#     file.write(src)
# with open("index.html") as file:
#     src = file.read()
#
# soup = BeautifulSoup(src, "lxml")
# all_products_hrefs = soup.find_all(class_="mzr-tc-group-item-href")
#
# all_categories_dict = {}
# for item in all_products_hrefs:
#     item_text = item.text
#     item_href = "https://health-diet.ru" + item.get("href")
#
#     all_categories_dict[item_text] = item_href
#
# with open("all_categories_dict.json", "w") as file:
#     json.dump(all_categories_dict, file, indent=4, ensure_ascii=False)
# Load the category name -> URL mapping produced by the (commented-out)
# one-time scraping step above.
with open("all_categories_dict.json") as file:
    all_categories = json.load(file)

iteration_count = len(all_categories) - 1  # len() already returns an int
count = 0
print(f"Всего итераций: {iteration_count}")

for category_name, category_href in all_categories.items():
    # Sanitize the category name so it is safe to embed in file names.
    for bad_char in (",", " ", "-", "'"):
        category_name = category_name.replace(bad_char, "_")

    req = requests.get(url=category_href, headers=headers)
    src = req.text

    # Cache the raw HTML on disk, then parse from the cached copy.
    with open(f"data/{count}_{category_name}.html", "w", encoding="utf-8") as file:
        file.write(src)
    with open(f"data/{count}_{category_name}.html", encoding="utf-8") as file:
        src = file.read()

    soup = BeautifulSoup(src, "lxml")

    # Skip pages that show an alert block instead of a product table.
    if soup.find(class_="uk-alert-danger") is not None:
        continue

    # Table header cells: product, calories, proteins, fats, carbohydrates.
    table_head = soup.find(class_="mzr-tc-group-table").find("tr").find_all("th")
    header_row = [th.text for th in table_head[:5]]

    # Collect per-product data once; files are written after the loop.
    products_data = soup.find(class_="mzr-tc-group-table").find("tbody").find_all("tr")
    product_info = []
    csv_rows = []
    for item in products_data:
        product_tds = item.find_all("td")
        title = product_tds[0].find("a").text
        calories = product_tds[1].text
        proteins = product_tds[2].text
        fats = product_tds[3].text
        carbohydrates = product_tds[4].text

        product_info.append(
            {
                "Title": title,
                "Calories": calories,
                "Proteins": proteins,
                "Fats": fats,
                "Carbohydrates": carbohydrates
            }
        )
        csv_rows.append((title, calories, proteins, fats, carbohydrates))

    # BUGFIX: newline="" stops csv.writer from emitting a blank line after
    # every row on Windows. The file is also written in a single pass
    # instead of being reopened in append mode for every product row.
    with open(f"data/{count}_{category_name}.csv", "w", encoding="utf-8", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(header_row)
        writer.writerows(csv_rows)

    # BUGFIX: mode "w" instead of "a" — appending a second JSON array on a
    # re-run would leave the file as invalid JSON.
    with open(f"data/{count}_{category_name}.json", "w", encoding="utf-8") as file:
        json.dump(product_info, file, indent=4, ensure_ascii=False)

    count += 1
    print(f"# Итерация {count}. {category_name} записан...")
    iteration_count = iteration_count - 1

    # NOTE(review): starting from len-1 and breaking at 0 stops one category
    # short of the full list when nothing is skipped — this mirrors the
    # original behaviour; confirm it is intentional.
    if iteration_count == 0:
        print("Работа завершена")
        break
    print(f"Осталось итераций: {iteration_count}")
Строки 81 и 116 записывают в CSV