Add CV vector store

This commit is contained in:
estromenko 2025-10-26 19:48:53 +03:00
parent fb95e6e799
commit 42395de6d3
6 changed files with 66 additions and 28 deletions

View File

@ -8,6 +8,7 @@ from vacancies.main.models import Customer, CustomerCV
from langchain.agents import create_agent from langchain.agents import create_agent
from langchain_openai import ChatOpenAI from langchain_openai import ChatOpenAI
from langgraph.checkpoint.memory import InMemorySaver from langgraph.checkpoint.memory import InMemorySaver
from vacancies.main.vector_store import add_vectors, extract_features
SYSTEM_PROMPT = """ SYSTEM_PROMPT = """
Ты карьерный копилот для ИТ. Ты можешь отвечать на любые вопросы по тематике карьеры. Ты карьерный копилот для ИТ. Ты можешь отвечать на любые вопросы по тематике карьеры.
@ -75,6 +76,13 @@ async def handle_document(update: Update, context: ContextTypes.DEFAULT_TYPE):
customer_cv, _ = await CustomerCV.objects.aupdate_or_create(customer=customer, defaults=dict( customer_cv, _ = await CustomerCV.objects.aupdate_or_create(customer=customer, defaults=dict(
content=resume, content=resume,
)) ))
features = extract_features(customer_cv.content)
add_vectors(
"cvs",
customer_cv.id,
features.model_dump(),
{'content': customer_cv.content, 'features_json': features.model_dump()},
)
await context.bot.send_message(chat_id=update.effective_chat.id, text="Отлично! Запомнил Ваше резюме.") await context.bot.send_message(chat_id=update.effective_chat.id, text="Отлично! Запомнил Ваше резюме.")

View File

@ -1,7 +1,7 @@
from django.core.management import BaseCommand from django.core.management import BaseCommand
from vacancies.main.models import Vacancy from vacancies.main.models import Vacancy
import clickhouse_connect import clickhouse_connect
from vacancies.main.vector_store import add_vacancy_vectors, extract_vacancy_features from vacancies.main.vector_store import add_vectors, extract_features
clickhouse_client = clickhouse_connect.create_client(host="127.0.0.1", port=18123) clickhouse_client = clickhouse_connect.create_client(host="127.0.0.1", port=18123)
@ -24,15 +24,28 @@ class Command(BaseCommand):
help = "Collect vacancies from telegram messages" help = "Collect vacancies from telegram messages"
def handle(self, *args, **options): def handle(self, *args, **options):
for index, row in enumerate(clickhouse_client.query(query).result_rows): import time
start_time = time.time()
result_rows = clickhouse_client.query(query).result_rows
print(f"query time: {time.time() - start_time:.4f}")
result_rows_len = len(result_rows)
for index, row in enumerate(result_rows):
(id, chat_username, telegram_id, message, timestamp) = row (id, chat_username, telegram_id, message, timestamp) = row
link = f"https://t.me/{chat_username}/{telegram_id}" link = f"https://t.me/{chat_username}/{telegram_id}"
features = extract_vacancy_features(message) print(f"Processing {index}/{result_rows_len} link: {link}")
start_time = time.time()
features = extract_features(message)
print(f"ai time: {time.time() - start_time:.4f}")
vacancy, created = Vacancy.objects.get_or_create( vacancy, created = Vacancy.objects.get_or_create(
link=link, link=link,
defaults={'content': message, 'features_json': features.model_dump()}
) )
print(index, link) start_time = time.time()
add_vacancy_vectors(vacancy.id, features.model_dump(), {"link": link}) add_vectors(
"vacancies",
vacancy.id,
features.model_dump(),
{'content': message, 'features_json': features.model_dump()},
)
print(f"write vector time: {time.time() - start_time:.4f}")

View File

@ -1,8 +1,8 @@
import asyncio
from django.core.management import BaseCommand from django.core.management import BaseCommand
from vacancies.main.vector_store import search_similarities from vacancies.main.vector_store import search_similarities
from vacancies.main.models import CustomerCV, RecommendedVacancy from vacancies.main.models import CustomerCV, RecommendedVacancy
from vacancies.main.bot import application from vacancies.main.bot import application
from vacancies.main.features_extractor import extract_vacancy_features
from telegram import InlineKeyboardButton, InlineKeyboardMarkup from telegram import InlineKeyboardButton, InlineKeyboardMarkup
from qdrant_client.models import Filter, HasIdCondition from qdrant_client.models import Filter, HasIdCondition
@ -14,23 +14,22 @@ class Command(BaseCommand):
customer_cvs = CustomerCV.objects.all() customer_cvs = CustomerCV.objects.all()
for customer_cv in customer_cvs: for customer_cv in customer_cvs:
features = extract_vacancy_features(customer_cv.content)
recommended_vacancy_ids = RecommendedVacancy.objects.filter( recommended_vacancy_ids = RecommendedVacancy.objects.filter(
customer=customer_cv.customer, customer=customer_cv.customer,
).values_list('vacancy_id', flat=True) ).values_list('vacancy_id', flat=True)
query_filter = Filter(must_not=[HasIdCondition(has_id=recommended_vacancy_ids)]) query_filter = Filter(must_not=[HasIdCondition(has_id=recommended_vacancy_ids)])
search_result_id = search_similarities(features.model_dump(), query_filter) search_result_id = search_similarities(query_filter, customer_cv.id)
recommendation = RecommendedVacancy.objects.create( recommendation = RecommendedVacancy.objects.create(
customer=customer_cv.customer, customer=customer_cv.customer,
vacancy_id=search_result_id, vacancy_id=search_result_id,
) )
application.bot.send_message( asyncio.run(application.bot.send_message(
chat_id=recommendation.customer.chat_id, chat_id=recommendation.customer.chat_id,
text=recommendation.vacancy.content, text=recommendation.vacancy.content,
reply_markup=InlineKeyboardMarkup([[ reply_markup=InlineKeyboardMarkup([[
InlineKeyboardButton("Откликнуться", url=recommendation.vacancy.link), InlineKeyboardButton("Откликнуться", url=recommendation.vacancy.link),
]]), ]]),
) ))

View File

@ -0,0 +1,17 @@
# Generated by Django 5.2.7 on 2025-10-26 16:14
from django.db import migrations
class Migration(migrations.Migration):
    """Remove the ``features_json`` field from the ``vacancy`` model."""

    # Must be applied after the previous migration of the ``main`` app.
    dependencies = [('main', '0005_remove_vacancy_is_processed')]

    operations = [
        migrations.RemoveField(model_name='vacancy', name='features_json'),
    ]

View File

@ -34,7 +34,6 @@ class CustomerCV(models.Model):
class Vacancy(models.Model): class Vacancy(models.Model):
content = models.TextField() content = models.TextField()
link = models.URLField(unique=True) link = models.URLField(unique=True)
features_json = models.JSONField(null=True, blank=True)
created_at = models.DateTimeField(auto_now_add=True) created_at = models.DateTimeField(auto_now_add=True)
objects = models.Manager() objects = models.Manager()

View File

@ -17,10 +17,14 @@ vectors_config = {
name: models.VectorParams(size=3072, distance=models.Distance.COSINE) for name in FEATURE_NAMES name: models.VectorParams(size=3072, distance=models.Distance.COSINE) for name in FEATURE_NAMES
} }
collection_name = "vacancies" if not client.collection_exists("vacancies"):
if not client.collection_exists(collection_name):
client.create_collection( client.create_collection(
collection_name=collection_name, collection_name="vacancies",
vectors_config=vectors_config
)
if not client.collection_exists("cvs"):
client.create_collection(
collection_name="cvs",
vectors_config=vectors_config vectors_config=vectors_config
) )
@ -39,7 +43,7 @@ def _prepare_texts(features):
return texts return texts
def add_vacancy_vectors(vacancy_id: int, features: dict, payload: dict): def add_vectors(collection_name: str, _id: int, features: dict, payload: dict):
"""Add vectors for a vacancy based on its features.""" """Add vectors for a vacancy based on its features."""
texts = _prepare_texts(features) texts = _prepare_texts(features)
vectors = {} vectors = {}
@ -52,7 +56,7 @@ def add_vacancy_vectors(vacancy_id: int, features: dict, payload: dict):
collection_name=collection_name, collection_name=collection_name,
points=[ points=[
models.PointStruct( models.PointStruct(
id=vacancy_id, id=_id,
vector=vectors, vector=vectors,
payload=payload, payload=payload,
) )
@ -60,20 +64,18 @@ def add_vacancy_vectors(vacancy_id: int, features: dict, payload: dict):
) )
def search_similarities(query_features: dict, query_filter: Filter) -> list[dict]: def search_similarities(query_filter: Filter, cv_id: int) -> list[dict]:
texts = _prepare_texts(query_features) vectors = client.retrieve(
vectors = {} collection_name="cvs",
for name, text in texts.items(): ids=[cv_id],
vectors[name] = [0.0] * 3072 with_vectors=True,
if text: )[0].vector
vec = embedding.embed_query(text)
vectors[name] = vec
max_similarities = {} max_similarities = {}
for name, vec in vectors.items(): for name, vec in vectors.items():
if any(v != 0 for v in vec): if any(v != 0 for v in vec):
results = client.search( results = client.search(
collection_name=collection_name, collection_name="vacancies",
query_vector=(name, vec), query_vector=(name, vec),
limit=1000, limit=1000,
with_payload=True, with_payload=True,
@ -95,7 +97,7 @@ def search_similarities(query_features: dict, query_filter: Filter) -> list[dict
return scored[0]["id"] return scored[0]["id"]
def extract_vacancy_features(content: str) -> VacancyFeatures: def extract_features(content: str) -> VacancyFeatures:
prompt = f""" prompt = f"""
Extract the following features from the job vacancy description. If a feature is not mentioned, set it to null. Extract the following features from the job vacancy description. If a feature is not mentioned, set it to null.
Features: Features: