diff --git a/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py b/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py index 54d8880..784fb6a 100644 --- a/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py +++ b/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py @@ -5,23 +5,7 @@ from vacancies.main.vector_store import add_vectors, extract_features, client as clickhouse_client = clickhouse_connect.create_client(host="127.0.0.1", port=18123) -next_page_offset = 0 -exist_points_ids = [] -while next_page_offset is not None: - response = qdrant.scroll( - collection_name="vacancies", - limit=100_000, - offset=next_page_offset, - with_payload=False, - with_vectors=False, - timeout=30, - ) - exist_points_ids.extend([point.id for point in response[0]]) - next_page_offset = response[1] -exist_points_set = tuple(exist_points_ids) -print("qdrant vacancies points count:", len(exist_points_set)) - -query = f""" +query = """ SELECT id, chat_username, telegram_id, message, timestamp FROM telegram_parser_chatmessage WHERE timestamp >= now() - INTERVAL 30 DAY @@ -33,7 +17,7 @@ WHERE timestamp >= now() - INTERVAL 30 DAY 'удаленно', 'гибкий график', 'полный день', 'частичная занятость', 'резюме', 'собеседование', 'junior', 'middle', 'senior' ]) >= 5 AND position(message, 'О себе') = 0 AND position(message, 'Обо мне') = 0 AND position(message, '#ищу') = 0 - AND id NOT IN {exist_points_set} + AND id NOT IN %(exist_points)s """ @@ -41,28 +25,36 @@ class Command(BaseCommand): help = "Collect vacancies from telegram messages" def handle(self, *args, **options): - import time - start_time = time.time() - result_rows = clickhouse_client.query(query).result_rows - print(f"query time: {time.time() - start_time:.4f}") + next_page_offset = 0 + exist_points_ids = [-1] + while next_page_offset is not None: + response = qdrant.scroll( + collection_name="vacancies", + limit=100_000, + offset=next_page_offset, + with_payload=False, + with_vectors=False, + timeout=30, + ) + exist_points_ids.extend([point.id for point in response[0]]) + next_page_offset = response[1] + exist_points_set = tuple(set(exist_points_ids)) + + result_rows = clickhouse_client.query(query, parameters={"exist_points": exist_points_set}).result_rows result_rows_len = len(result_rows) for index, row in enumerate(result_rows): (id, chat_username, telegram_id, message, timestamp) = row link = f"https://t.me/{chat_username}/{telegram_id}" print(f"Processing {index}/{result_rows_len} link: {link}") - start_time = time.time() features = extract_features(message) - print(f"ai time: {time.time() - start_time:.4f}") vacancy, created = Vacancy.objects.get_or_create( link=link, ) - start_time = time.time() add_vectors( "vacancies", vacancy.id, features.model_dump(), {'content': message, 'features_json': features.model_dump()}, ) - print(f"write vector time: {time.time() - start_time:.4f}") diff --git a/vacancies/main/management/commands/generate_recommended_vacancies.py b/vacancies/main/management/commands/generate_recommended_vacancies.py index 977cece..f889d5e 100644 --- a/vacancies/main/management/commands/generate_recommended_vacancies.py +++ b/vacancies/main/management/commands/generate_recommended_vacancies.py @@ -1,6 +1,7 @@ import asyncio + from django.core.management import BaseCommand -from vacancies.main.vector_store import search_similarities, client +from vacancies.main.vector_store import search_similarities from vacancies.main.models import CustomerCV, RecommendedVacancy from vacancies.main.bot import application from telegram import InlineKeyboardButton, InlineKeyboardMarkup @@ -19,12 +20,7 @@ class Command(BaseCommand): ).values_list('vacancy_id', flat=True) query_filter = Filter(must_not=[HasIdCondition(has_id=recommended_vacancy_ids)]) - search_result_id = search_similarities(query_filter, customer_cv.id) - - vacancy_content = client.retrieve( - collection_name="vacancies", - ids=[search_result_id], - )[0].payload["content"] + search_result_id, vacancy_content = search_similarities(query_filter, customer_cv.id) recommendation = RecommendedVacancy.objects.create( customer=customer_cv.customer, diff --git a/vacancies/main/models.py b/vacancies/main/models.py index 41ac3d4..836117b 100644 --- a/vacancies/main/models.py +++ b/vacancies/main/models.py @@ -62,6 +62,7 @@ class RecommendedVacancy(models.Model): class VacancyFeatures(BaseModel): + job_title: str | None = None # Должность employment_type: str | None = None # Тип занятости work_format: str | None = None # Формат работы experience: str | None = None # Опыт работы diff --git a/vacancies/main/vector_store.py b/vacancies/main/vector_store.py index 2a736dd..82e43d1 100644 --- a/vacancies/main/vector_store.py +++ b/vacancies/main/vector_store.py @@ -9,15 +9,16 @@ from vacancies.main.models import VacancyFeatures client = QdrantClient(url="http://localhost:6333") FEATURE_NAMES = [ - "employment_type", "work_format", "experience", "position_level", "industry", "tech_stack", + "job_title", "employment_type", "work_format", "experience", "position_level", "industry", "tech_stack", "location", "salary_range", "languages", "education", "schedule", "additional_requirements" ] weights = { + "job_title": 10, "employment_type": 2, "work_format": 2, - "experience": 4, - "position_level": 4, + "experience": 3, + "position_level": 5, "industry": 4, "tech_stack": 5, "location": 2, @@ -87,35 +88,40 @@ def search_similarities(query_filter: Filter, cv_id: int) -> list[dict]: )[0].vector max_similarities = {} + vacancies_content = {} for name, vec in vectors.items(): if any(v != 0 for v in vec): - results = client.search( + results = client.query_points( collection_name="vacancies", - query_vector=(name, vec), + query=vec, + using=name, limit=1000, with_payload=True, query_filter=query_filter, ) - for res in results: + for res in results.points: vid = res.id sim = res.score if vid not in max_similarities: max_similarities[vid] = {} max_similarities[vid][name] = sim + if vid not in vacancies_content: + vacancies_content[vid] = res.payload["content"] scored = [] for vid, feature_sims in max_similarities.items(): total = sum(feature_sims[feature] * weights.get(feature, 1) for feature in feature_sims) - scored.append({"id": vid, "score": total}) + scored.append({"id": vid, "score": total, "content": vacancies_content[vid]}) scored.sort(key=lambda x: x["score"], reverse=True) - return scored[0]["id"] + return scored[0]["id"], scored[0]["content"] def extract_features(content: str) -> VacancyFeatures: prompt = f""" Extract the following features from the job vacancy description. If a feature is not mentioned, set it to null. Features: + - job_title: Должность (e.g., DevOps, Python программист) - employment_type: Тип занятости (e.g., Полная занятость, Частичная) - work_format: Формат работы (e.g., Офис, Удалённо, Гибрид) - experience: Опыт работы (e.g., 3-5 лет, Нет опыта)