diff --git a/vacancies/main/bot.py b/vacancies/main/bot.py index cf34605..732c5d8 100644 --- a/vacancies/main/bot.py +++ b/vacancies/main/bot.py @@ -8,6 +8,7 @@ from vacancies.main.models import Customer, CustomerCV from langchain.agents import create_agent from langchain_openai import ChatOpenAI from langgraph.checkpoint.memory import InMemorySaver +from vacancies.main.vector_store import add_vectors, extract_features SYSTEM_PROMPT = """ Ты — карьерный копилот для ИТ. Ты можешь отвечать на любые вопросы по тематике карьеры. @@ -63,7 +64,7 @@ async def error_handler(update: object, context: ContextTypes.DEFAULT_TYPE) -> N async def handle_document(update: Update, context: ContextTypes.DEFAULT_TYPE): if not update.message.document: await context.bot.send_message(chat_id=update.effective_chat.id, text="Не удалось прочитать информацию из файла! Попробуйте другой формат.") - return + return buffer = io.BytesIO() file = await update.message.document.get_file() @@ -75,6 +76,13 @@ async def handle_document(update: Update, context: ContextTypes.DEFAULT_TYPE): customer_cv, _ = await CustomerCV.objects.aupdate_or_create(customer=customer, defaults=dict( content=resume, )) + features = extract_features(customer_cv.content) + add_vectors( + "cvs", + customer_cv.id, + features.model_dump(), + {'content': customer_cv.content, 'features_json': features.model_dump()}, + ) await context.bot.send_message(chat_id=update.effective_chat.id, text="Отлично! Запомнил Ваше резюме.") diff --git a/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py b/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py index 88ff7e0..408f050 100644 --- a/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py +++ b/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py @@ -1,7 +1,7 @@ from django.core.management import BaseCommand from vacancies.main.models import Vacancy import clickhouse_connect -from vacancies.main.vector_store import add_vacancy_vectors, extract_vacancy_features +from vacancies.main.vector_store import add_vectors, extract_features clickhouse_client = clickhouse_connect.create_client(host="127.0.0.1", port=18123) @@ -24,15 +24,28 @@ class Command(BaseCommand): help = "Collect vacancies from telegram messages" def handle(self, *args, **options): - for index, row in enumerate(clickhouse_client.query(query).result_rows): + import time + start_time = time.time() + result_rows = clickhouse_client.query(query).result_rows + print(f"query time: {time.time() - start_time:.4f}") + result_rows_len = len(result_rows) + for index, row in enumerate(result_rows): (id, chat_username, telegram_id, message, timestamp) = row link = f"https://t.me/{chat_username}/{telegram_id}" - features = extract_vacancy_features(message) + print(f"Processing {index}/{result_rows_len} link: {link}") + start_time = time.time() + features = extract_features(message) + print(f"ai time: {time.time() - start_time:.4f}") vacancy, created = Vacancy.objects.get_or_create( link=link, - defaults={'content': message, 'features_json': features.model_dump()} ) - print(index, link) - add_vacancy_vectors(vacancy.id, features.model_dump(), {"link": link}) + start_time = time.time() + add_vectors( + "vacancies", + vacancy.id, + features.model_dump(), + {'content': message, 'features_json': features.model_dump()}, + ) + print(f"write vector time: {time.time() - start_time:.4f}") diff --git a/vacancies/main/management/commands/generate_recommended_vacancies.py b/vacancies/main/management/commands/generate_recommended_vacancies.py index 8cc75be..51f2bb5 100644 --- a/vacancies/main/management/commands/generate_recommended_vacancies.py +++ b/vacancies/main/management/commands/generate_recommended_vacancies.py @@ -1,8 +1,8 @@ +import asyncio from django.core.management import BaseCommand from vacancies.main.vector_store import search_similarities from vacancies.main.models import CustomerCV, RecommendedVacancy from vacancies.main.bot import application -from vacancies.main.features_extractor import extract_vacancy_features from telegram import InlineKeyboardButton, InlineKeyboardMarkup from qdrant_client.models import Filter, HasIdCondition @@ -14,23 +14,22 @@ class Command(BaseCommand): customer_cvs = CustomerCV.objects.all() for customer_cv in customer_cvs: - features = extract_vacancy_features(customer_cv.content) recommended_vacancy_ids = RecommendedVacancy.objects.filter( customer=customer_cv.customer, ).values_list('vacancy_id', flat=True) query_filter = Filter(must_not=[HasIdCondition(has_id=recommended_vacancy_ids)]) - search_result_id = search_similarities(features.model_dump(), query_filter) + search_result_id = search_similarities(query_filter, customer_cv.id) recommendation = RecommendedVacancy.objects.create( customer=customer_cv.customer, vacancy_id=search_result_id, ) - application.bot.send_message( + asyncio.run(application.bot.send_message( chat_id=recommendation.customer.chat_id, text=recommendation.vacancy.content, reply_markup=InlineKeyboardMarkup([[ InlineKeyboardButton("Откликнуться", url=recommendation.vacancy.link), ]]), - ) + )) diff --git a/vacancies/main/migrations/0006_remove_vacancy_features_json.py b/vacancies/main/migrations/0006_remove_vacancy_features_json.py new file mode 100644 index 0000000..c3d55f7 --- /dev/null +++ b/vacancies/main/migrations/0006_remove_vacancy_features_json.py @@ -0,0 +1,17 @@ +# Generated by Django 5.2.7 on 2025-10-26 16:14 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('main', '0005_remove_vacancy_is_processed'), + ] + + operations = [ + migrations.RemoveField( + model_name='vacancy', + name='features_json', + ), + ] diff --git a/vacancies/main/models.py b/vacancies/main/models.py index b1bde3c..41ac3d4 100644 --- a/vacancies/main/models.py +++ b/vacancies/main/models.py @@ -34,7 +34,6 @@ class CustomerCV(models.Model): class Vacancy(models.Model): content = models.TextField() link = models.URLField(unique=True) - features_json = models.JSONField(null=True, blank=True) created_at = models.DateTimeField(auto_now_add=True) objects = models.Manager() diff --git a/vacancies/main/vector_store.py b/vacancies/main/vector_store.py index f28df2d..25ee98c 100644 --- a/vacancies/main/vector_store.py +++ b/vacancies/main/vector_store.py @@ -17,10 +17,14 @@ vectors_config = { name: models.VectorParams(size=3072, distance=models.Distance.COSINE) for name in FEATURE_NAMES } -collection_name = "vacancies" -if not client.collection_exists(collection_name): +if not client.collection_exists("vacancies"): client.create_collection( - collection_name=collection_name, + collection_name="vacancies", + vectors_config=vectors_config + ) +if not client.collection_exists("cvs"): + client.create_collection( + collection_name="cvs", vectors_config=vectors_config ) @@ -39,7 +43,7 @@ def _prepare_texts(features): return texts -def add_vacancy_vectors(vacancy_id: int, features: dict, payload: dict): +def add_vectors(collection_name: str, _id: int, features: dict, payload: dict): """Add vectors for a vacancy based on its features.""" texts = _prepare_texts(features) vectors = {} @@ -52,7 +56,7 @@ def add_vacancy_vectors(vacancy_id: int, features: dict, payload: dict): collection_name=collection_name, points=[ models.PointStruct( - id=vacancy_id, + id=_id, vector=vectors, payload=payload, ) @@ -60,20 +64,18 @@ def add_vacancy_vectors(vacancy_id: int, features: dict, payload: dict): ) -def search_similarities(query_features: dict, query_filter: Filter) -> list[dict]: - texts = _prepare_texts(query_features) - vectors = {} - for name, text in texts.items(): - vectors[name] = [0.0] * 3072 - if text: - vec = embedding.embed_query(text) - vectors[name] = vec +def search_similarities(query_filter: Filter, cv_id: int) -> list[dict]: + vectors = client.retrieve( + collection_name="cvs", + ids=[cv_id], + with_vectors=True, + )[0].vector max_similarities = {} for name, vec in vectors.items(): if any(v != 0 for v in vec): results = client.search( - collection_name=collection_name, + collection_name="vacancies", query_vector=(name, vec), limit=1000, with_payload=True, @@ -95,7 +97,7 @@ def search_similarities(query_features: dict, query_filter: Filter) -> list[dict return scored[0]["id"] -def extract_vacancy_features(content: str) -> VacancyFeatures: +def extract_features(content: str) -> VacancyFeatures: prompt = f""" Extract the following features from the job vacancy description. If a feature is not mentioned, set it to null. Features: