#!/usr/bin/env python3
"""Scan Avito listing pages and print the ads that match a watch list.

Each URL in ``urls`` is loaded, every ``div.description`` element on the page
is turned into an ``Adtuple`` (title, price text, relative link), and the ads
whose title matches one of the patterns in ``regexes`` -- and whose price
satisfies that pattern's optional limits -- are printed one per line.
"""
import re

BASEURL = "https://www.avito.ru"

# Listing pages to scan.
urls = [
    "https://www.avito.ru/rossiya/noutbuki",
    "https://www.avito.ru/sankt-peterburg/noutbuki",
    "https://www.avito.ru/moskva/noutbuki",
    "https://www.avito.ru/sankt-peterburg/tovary_dlya_kompyutera/komplektuyuschie/zhestkie_diski",
    "https://www.avito.ru/sankt-peterburg/tovary_dlya_kompyutera/perenosnye_zhestkie_diski",
]

# Title pattern (matched case-insensitively) -> price limits.
# Recognised limit keys are 'price_min' and 'price_max'; a missing key means
# "no bound on that side", so an empty dict accepts any price.
regexes = {
    "[xXхХ]220": {},
    "[xXхХ] 220": {},
    r"^(?!.*(зарядн|салазк|адапте|ultrabook|mac|шлейф|ультрабу|игровой|ide)).*(usb|внешни)": {'price_max': 3000},
}


class Adtuple(tuple):
    """A ``(title, price, link)`` triple that prints as a tab-separated row."""

    def __str__(self):
        """Return ``price<TAB>title<TAB>absolute URL``.

        The link is stored relative to ``BASEURL``. When an ad has no link
        the URL column is left empty instead of raising on ``str + None``.
        """
        link = self[2]
        full_url = BASEURL + link if link else ""
        return "{}\t{}\t{}".format(self[1], self[0], full_url)


def safe_cast(val, to_type, default=0):
    """Convert a price string such as ``'3 000 руб.'`` with ``to_type``.

    Args:
        val: Value to convert. For strings the ``' руб.'`` suffix and all
            spaces are removed first; any other value is handed to
            ``to_type`` unchanged.
        to_type: Callable performing the conversion, e.g. ``int``.
        default: Returned when the conversion fails.

    Returns:
        ``to_type(cleaned value)``, or ``default`` when ``to_type`` raises
        ``ValueError`` or ``TypeError``.
    """
    try:
        # Only strings can carry the currency suffix. Calling .replace on
        # anything else (e.g. None) would raise AttributeError, which the
        # handler below does not cover.
        if isinstance(val, str):
            val = val.replace(' руб.', '').replace(' ', '')
        return to_type(val)
    except (ValueError, TypeError):
        return default


def _price_within(price, limits):
    """Return True when ``price`` respects every bound present in ``limits``."""
    price_min = limits.get('price_min')
    price_max = limits.get('price_max')
    if price_min is not None and price < price_min:
        return False
    if price_max is not None and price > price_max:
        return False
    return True


def find_good(adlist):
    """Select the ads whose title and price satisfy an entry of ``regexes``.

    An ad is kept when its title (``ad[0]``) matches a pattern, ignoring
    case, and its price (``ad[1]``) lies within that pattern's limits.
    A pattern without limits accepts any price. Prices that cannot be parsed
    are treated as ``0`` (the ``safe_cast`` default), so they pass any
    ``price_max`` bound.

    Args:
        adlist: Iterable of ``(title, price, link)`` sequences.

    Returns:
        A list of the matching ads in input order. Each ad appears at most
        once, even when several patterns match it.
    """
    good_list = []
    for ad in adlist:
        price = safe_cast(ad[1], int)
        for pattern, limits in regexes.items():
            if not re.search(pattern, ad[0], re.IGNORECASE):
                continue
            if _price_within(price, limits):
                good_list.append(ad)
                break  # one hit is enough; do not list the same ad twice
    return good_list


def main():
    """Load every page in ``urls`` and print the matching ads."""
    # The scraping dependencies are imported here so that the pure helpers
    # above can be imported without them being installed.
    import requests  # noqa: F401 -- imported by the original script; not referenced directly
    from pyquery import PyQuery as pq

    for url in urls:
        # PyQuery is handed the URL itself and is expected to download and
        # parse the page.
        tree = pq(url)
        ads = []
        for post in tree("div.description"):
            node = pq(post)
            title_link = node('h3.title>a')
            title = title_link.text().strip()
            link = title_link.attr('href')
            price = node('div.about').text().strip()
            ads.append(Adtuple((title, price, link)))
        for good_ad in find_good(ads):
            print(good_ad)


if __name__ == "__main__":
    main()