diff --git a/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py b/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py
index 408f050..54d8880 100644
--- a/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py
+++ b/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py
@@ -1,11 +1,40 @@
 from django.core.management import BaseCommand
 from vacancies.main.models import Vacancy
 import clickhouse_connect
-from vacancies.main.vector_store import add_vectors, extract_features
+from vacancies.main.vector_store import add_vectors, extract_features, client as qdrant
 
 clickhouse_client = clickhouse_connect.create_client(host="127.0.0.1", port=18123)
 
-query = """
+# Collect the id of every point already stored in the Qdrant "vacancies"
+# collection so that the ClickHouse query below can leave those rows out.
+next_page_offset = 0
+exist_points_ids = []
+while next_page_offset is not None:
+    # Only ids are needed, so payloads and vectors are not requested.
+    points, next_page_offset = qdrant.scroll(
+        collection_name="vacancies",
+        limit=100_000,
+        offset=next_page_offset,
+        with_payload=False,
+        with_vectors=False,
+        timeout=30,
+    )
+    exist_points_ids.extend(point.id for point in points)
+exist_points_set = tuple(exist_points_ids)
+print("qdrant vacancies points count:", len(exist_points_set))
+
+# Formatting the tuple straight into the SQL text renders "()" when there are no
+# ids and "(1,)" when there is exactly one, so the id list is joined by hand and
+# the whole condition is dropped when there is nothing to exclude.
+# NOTE(review): every id is inlined into the query text; a large collection may
+# exceed ClickHouse's max_query_size limit -- confirm against production volumes.
+exclude_existing_clause = (
+    f"AND id NOT IN ({', '.join(map(repr, exist_points_set))})"
+    if exist_points_set
+    else ""
+)
+
+query = f"""
 SELECT id, chat_username, telegram_id, message, timestamp
 FROM telegram_parser_chatmessage
 WHERE timestamp >= now() - INTERVAL 30 DAY
@@ -17,6 +46,7 @@ WHERE timestamp >= now() - INTERVAL 30 DAY
         'удаленно', 'гибкий график', 'полный день', 'частичная занятость', 'резюме', 'собеседование', 'junior', 'middle', 'senior'
     ]) >= 5
     AND position(message, 'О себе') = 0 AND position(message, 'Обо мне') = 0 AND position(message, '#ищу') = 0
+    {exclude_existing_clause}
 """