Пытаюсь спарсить вот эту страничку. Мне нужны именно ссылки, но у них нет класса, и я никак не могу их достать. Вот ссылка: https://megaholl.ru/catalog/podarki/
Там класс unit_goods transition, далее получаете вложенный тег <a>. Пример есть здесь: https://itfy.org/threads/pomogite-razobratsja-s-parsingom.2580/
from bs4 import BeautifulSoup
import requests

# One-time download of the catalogue pages into a local cache file.
# Kept commented out so the script works offline from index.html.
# NOTE(review): the original URL lacked "?" before PAGEN_1 (later snippets use
# "?PAGEN_1={i}"), and mode "a" appends on every rerun — verify before enabling.
# for i in range(1, 13):
#     url = f"https://megaholl.ru/catalog/podarki/?PAGEN_1={i}"
#     headers = {
#         "Accept": "*/*",
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36",
#     }
#     req = requests.get(url, headers=headers)
#     with open("index.html", "a") as file:
#         file.write(req.text)

# Parse the cached page and print every <a> inside the first product card.
with open("index.html") as file:
    src = file.read()

soup = BeautifulSoup(src, "lxml")

# find() returns only the FIRST element with this class — or None when the
# markup is absent, so guard before dereferencing.
all_products_hrefs = soup.find(class_="unit_goods transition")
if all_products_hrefs is not None:
    for item in all_products_hrefs.find_all("a"):
        print(item)
    print(all_products_hrefs)
import requests
from bs4 import BeautifulSoup

# Fetch the first catalogue page and print the absolute URL of every product.
url = "https://megaholl.ru/catalog/podarki/"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')

# Each product card is <div class="unit_goods transition">; the product link is
# the nested <a> that carries an empty class attribute.
for card in soup.find_all("div", class_="unit_goods transition"):
    link = card.find("a", class_="")
    # Guard: a card without such an <a> would otherwise raise AttributeError.
    if link is not None and link.has_attr("href"):
        # href is site-relative, so prepend scheme + host.
        print("https://megaholl.ru" + link["href"])
Пытаюсь переместить всё это в JSON-файл, но никак не получается. Код:

import requests
from bs4 import BeautifulSoup
import json

for i in range(1, 14):
    url = f"https://megaholl.ru/catalog/podarki/?PAGEN_1={i}"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    page = soup.find_all("div", class_="unit_goods transition")
    all_categories_dict = {}
    for links in page:
        link = links.find("a", class_="")
        item_href = "https://megaholl.ru" + link["href"] + f"/?PAGEN_1={i}"
        item_text = link["title"][8:].replace(" купить в Megaholl.ru", "").replace(" купить в Megaholl.ru", "")
        # all_categories_dict.update({f"{item_text}": f"{item_href}"})
        all_categories_dict[item_text] = item_href
    with open("all_categories_dict.json", "a") as file:
        json.dump(all_categories_dict, file, indent=4, ensure_ascii=False)
# Reconstructed from a one-line forum paste ("Python:" marker dropped):
# print the absolute link of every product card on the first catalogue page.
import requests
from bs4 import BeautifulSoup

url = "https://megaholl.ru/catalog/podarki/"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')

page = soup.find_all("div", class_="unit_goods transition")
for i in page:
    # The product link is the nested <a> with an empty class attribute.
    links = i.find("a", class_="")
    print("https://megaholl.ru" + links["href"])
import requests
from bs4 import BeautifulSoup
import json

# Category name -> absolute URL, accumulated over all 13 catalogue pages.
dct = {}
for i in range(1, 14):
    url = f"https://megaholl.ru/catalog/podarki/?PAGEN_1={i}"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    for card in soup.find_all("div", class_="unit_goods transition"):
        link = card.find("a", class_="")
        # Guard: skip cards without the expected <a> instead of crashing.
        if link is None or not link.has_attr("href"):
            continue
        item_href = "https://megaholl.ru" + link["href"] + f"/?PAGEN_1={i}"
        # title looks like "Подарки <name> купить в Megaholl.ru": drop the
        # 8-char prefix and the marketing suffix.
        # NOTE(review): the original chained two identical-looking .replace
        # calls — possibly one used a non-breaking space; verify against the
        # live markup if some names keep the suffix.
        item_text = link["title"][8:].replace(" купить в Megaholl.ru", "").replace(" купить в Megaholl.ru", "")
        dct[item_text] = item_href

# "w", not "a": appending json.dump output on every run concatenates several
# JSON documents into one file, which json.load can no longer parse.
with open("all_categories_dict.json", "w", encoding="utf-8") as file:
    json.dump(dct, file, indent=4, ensure_ascii=False)
Хорошо, идём дальше: переходим на страницу https://megaholl.ru/catalog/fotoramki/derzhatel_dlya_fotografiy_sunburst_kikkerland/ . Как мне оттуда достать название и характеристики? Я пытался вот так, но не получается.
# Reconstructed from a one-line forum paste ("Код:" marker dropped).
# Indentation is inferred — TODO confirm against the original post.
# `all_products`, `count`, and the imports come from earlier in the thread.
for product_name, product_href in all_products.items():
    if count == 0:
        # Sanitize the product name so it is safe as part of a file name.
        rep = [",", " ", "'"]
        for item in rep:
            if item in product_name:
                product_name = product_name.replace(item, "_")
        # Cache the product page to disk, then re-read it for parsing.
        req = requests.get(url=product_href)
        src = req.text
        with open(f"data/{count}_{product_name}.html", "w") as f:
            f.write(src)
        with open(f"data/{count}_{product_name}.html") as f:
            src = f.read()
        soup = BeautifulSoup(src, "lxml")
        # Product title is the <h2> and the spec list the <ul> inside div.left
        # — presumably; verify selectors against the live page.
        div = soup.find_all("div", class_="left")
        for item in div:
            head = item.find("h2", class_="")
            specifications = item.find("ul")
            print(head)
            print(specifications)