From 45e89be6d029c9cf7c1849e26681a88c0ffd8dbd Mon Sep 17 00:00:00 2001 From: estromenko Date: Sat, 8 Nov 2025 22:40:14 +0300 Subject: [PATCH] Implement simplified recommendations --- compose.yaml | 7 - vacancies/main/admin.py | 10 ++ vacancies/main/bot.py | 51 +++--- ...ollect_vacancies_from_telegram_messages.py | 61 ++++--- .../generate_recommended_vacancies.py | 12 +- vacancies/main/management/commands/runbot.py | 2 +- ...itle_customercv_max_salary_rub_and_more.py | 55 ++++++ vacancies/main/models.py | 41 +++-- vacancies/main/vector_store.py | 168 +----------------- 9 files changed, 163 insertions(+), 244 deletions(-) create mode 100644 vacancies/main/migrations/0009_jobtitle_customercv_max_salary_rub_and_more.py diff --git a/compose.yaml b/compose.yaml index d2e4ddc..ddec35f 100644 --- a/compose.yaml +++ b/compose.yaml @@ -1,11 +1,4 @@ services: - qdrant: - image: qdrant/qdrant:latest - restart: always - ports: - - "127.0.0.1:6333:6333" - volumes: - - "/srv/vision-career/qdrant:/qdrant/storage" postgres: image: postgres:17-alpine3.20 restart: always diff --git a/vacancies/main/admin.py b/vacancies/main/admin.py index 1438c23..92a7e3f 100644 --- a/vacancies/main/admin.py +++ b/vacancies/main/admin.py @@ -14,3 +14,13 @@ class CustomerCVADMIN(admin.ModelAdmin): @admin.register(models.RecommendedVacancy) class RecommendedVacancyAdmin(admin.ModelAdmin): pass + + +@admin.register(models.Vacancy) +class VacancyAdmin(admin.ModelAdmin): + pass + + +@admin.register(models.JobTitle) +class JobTitleAdmin(admin.ModelAdmin): + pass diff --git a/vacancies/main/bot.py b/vacancies/main/bot.py index 2b33700..06fd87e 100644 --- a/vacancies/main/bot.py +++ b/vacancies/main/bot.py @@ -1,5 +1,4 @@ import io -import asyncio import os import traceback @@ -22,14 +21,11 @@ from telegram.ext import ( filters, ) +from pydantic import BaseModel +from typing import Literal from vacancies.conf.settings import DB_URI -from vacancies.main.models import Customer, CustomerCV -from vacancies.main.vector_store import ( - add_vectors, - batch_extract_features, - get_next_vacancy, - embed_features, -) +from vacancies.main.models import Customer, CustomerCV, JobTitle +from vacancies.main.vector_store import get_next_vacancy SYSTEM_PROMPT = """ Ты — карьерный копилот для ИТ. Ты можешь отвечать на любые вопросы по тематике карьеры. @@ -69,19 +65,17 @@ async def next_vacancy(update: Update, context: ContextTypes.DEFAULT_TYPE): await context.bot.send_message(chat_id=update.effective_chat.id, text=message) return - result = get_next_vacancy(customer_cv) - if not result: + vacancy = get_next_vacancy(customer_cv) + if not vacancy: message = "Вакансии закончились, возвращайтесь позже!" await context.bot.send_message(chat_id=update.effective_chat.id, text=message) return - recommendation, vacancy_content, link = result - await context.bot.send_message( chat_id=update.effective_chat.id, - text=vacancy_content, + text=vacancy.content, reply_markup=InlineKeyboardMarkup([[ - InlineKeyboardButton("Откликнуться", url=link), + InlineKeyboardButton("Откликнуться", url=vacancy.link), ]]), ) @@ -123,23 +117,28 @@ async def handle_document(update: Update, context: ContextTypes.DEFAULT_TYPE): reader = PdfReader(buffer) resume = "\n".join(page.extract_text() for page in reader.pages) + job_titles = JobTitle.objects.values_list('title', flat=True) + job_title_map = dict(JobTitle.objects.values_list('title', 'id')) + + class Structure(BaseModel): + job_title: Literal[tuple(job_titles)] + min_salary_rub: int | None + max_salary_rub: int | None + + openai_client = ChatOpenAI(model_name="gpt-5-mini", reasoning_effort="minimal", temperature=0, seed=42, top_p=1) + structured_llm = openai_client.with_structured_output(Structure) + + prompt = f"Extract fields from following CV: {resume}" + response = await structured_llm.ainvoke(prompt) + customer = await Customer.objects.aget(telegram_id=update.effective_user.id) customer_cv, _ = await CustomerCV.objects.aupdate_or_create(customer=customer, defaults=dict( content=resume, + job_title_id=job_title_map[response.job_title], + min_salary_rub=response.min_salary_rub, + max_salary_rub=response.max_salary_rub, )) - def upload_vectors(): - features = batch_extract_features([customer_cv.content])[0] - add_vectors( - "cvs", - customer_cv.id, - features.model_dump(), - {'content': customer_cv.content, 'features_json': features.model_dump()}, - embed_features(features.model_dump()), - ) - - await asyncio.to_thread(upload_vectors) - await context.bot.editMessageText("Отлично! Запомнил Ваше резюме.", update.effective_chat.id, message.id) diff --git a/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py b/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py index fee8053..33b6f6b 100644 --- a/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py +++ b/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py @@ -1,18 +1,14 @@ -from concurrent.futures import ThreadPoolExecutor -from datetime import datetime, timedelta from itertools import batched +from datetime import timedelta +from django.utils import timezone +from pydantic import BaseModel +from typing import Literal +from vacancies.main.models import Vacancy, JobTitle +from langchain_openai import ChatOpenAI import clickhouse_connect from django.core.management import BaseCommand from django.conf import settings -from qdrant_client.models import OrderBy - -from vacancies.main.vector_store import ( - add_vectors, - batch_extract_features, - embed_features, - qdrant_client, -) query = """ SELECT DISTINCT ON (message) id, chat_username, telegram_id, message, timestamp @@ -38,23 +34,38 @@ class Command(BaseCommand): help = "Collect vacancies from telegram messages" def handle(self, *args, **options): - response = qdrant_client.scroll(collection_name="vacancies", limit=1, order_by=OrderBy(key="timestamp", direction="desc")) - last_point_timestamp = datetime.now() - timedelta(days=30) - if response[0]: - last_point_timestamp = response[0][0].payload["timestamp"] + job_titles = JobTitle.objects.values_list('title', flat=True) + job_title_map = dict(JobTitle.objects.values_list('title', 'id')) + + class Structure(BaseModel): + job_title: Literal[tuple(job_titles)] + min_salary_rub: int | None + max_salary_rub: int | None + + openai_client = ChatOpenAI(model_name="gpt-5-mini", reasoning_effort="minimal", temperature=0, seed=42, top_p=1) + structured_llm = openai_client.with_structured_output(Structure) + + last_timestamp = timezone.now() - timedelta(days=30) + if last_vacancy := Vacancy.objects.order_by("-timestamp").first(): + last_timestamp = last_vacancy.timestamp clickhouse_client = clickhouse_connect.create_client(host=settings.CLICKHOUSE_HOST, port=settings.CLICKHOUSE_PORT) - result_rows = clickhouse_client.query(query, parameters={"timestamp": last_point_timestamp}).result_rows + result_rows = clickhouse_client.query(query, parameters={"timestamp": last_timestamp}).result_rows for index, rows in enumerate(batched(result_rows, settings.COLLECT_VACANCIES_BATCH_SIZE)): - vacancies_features = batch_extract_features([row[3] for row in rows]) - - print(f"Processing {index+1}/{len(result_rows)//settings.COLLECT_VACANCIES_BATCH_SIZE}") - with ThreadPoolExecutor() as pool: - vacancies_vectors = pool.map(embed_features, [vacancy_features.model_dump() for vacancy_features in vacancies_features]) - - for row, vacancy_features, vacancy_vectors in zip(rows, vacancies_features, vacancies_vectors): + prompts = [f"Extract fields from following vacancies: {row[3]}" for row in rows] + responses = structured_llm.batch(prompts) + vacancies = [] + for row, response in zip(rows, responses): + print(response) (id, chat_username, telegram_id, message, timestamp) = row - link = f"https://t.me/{chat_username}/{telegram_id}" - payload = {'content': message, 'features_json': vacancy_features.model_dump(), "link": link, "timestamp": timestamp} - add_vectors("vacancies", id, vacancy_features.model_dump(), payload, vacancy_vectors) + vacancies.append(Vacancy( + external_id=id, + job_title_id=job_title_map[response.job_title], + min_salary_rub=response.min_salary_rub, + max_salary_rub=response.max_salary_rub, + content=message, + timestamp=timestamp, + link=f"https://t.me/{chat_username}/{telegram_id}", + )) + print(Vacancy.objects.bulk_create(vacancies, ignore_conflicts=True)) diff --git a/vacancies/main/management/commands/generate_recommended_vacancies.py b/vacancies/main/management/commands/generate_recommended_vacancies.py index 8c56ec1..b36f78d 100644 --- a/vacancies/main/management/commands/generate_recommended_vacancies.py +++ b/vacancies/main/management/commands/generate_recommended_vacancies.py @@ -15,16 +15,14 @@ class Command(BaseCommand): async def ahandle(self, *args, **options): for customer_cv in CustomerCV.objects.all(): - result = get_next_vacancy(customer_cv) - if not result: + vacancy = get_next_vacancy(customer_cv) + if not vacancy: continue - recommendation, vacancy_content, link = result - await application.bot.send_message( - chat_id=recommendation.customer.chat_id, - text=vacancy_content, + chat_id=customer_cv.customer.chat_id, + text=vacancy.content, reply_markup=InlineKeyboardMarkup([[ - InlineKeyboardButton("Откликнуться", url=link), + InlineKeyboardButton("Откликнуться", url=vacancy.link), ]]), ) diff --git a/vacancies/main/management/commands/runbot.py b/vacancies/main/management/commands/runbot.py index 8fc396a..8726943 100644 --- a/vacancies/main/management/commands/runbot.py +++ b/vacancies/main/management/commands/runbot.py @@ -14,6 +14,6 @@ class Command(BaseCommand): checkpointer.setup() if sys.platform == "win32": - asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) + asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) application.run_polling() diff --git a/vacancies/main/migrations/0009_jobtitle_customercv_max_salary_rub_and_more.py b/vacancies/main/migrations/0009_jobtitle_customercv_max_salary_rub_and_more.py new file mode 100644 index 0000000..61f03af --- /dev/null +++ b/vacancies/main/migrations/0009_jobtitle_customercv_max_salary_rub_and_more.py @@ -0,0 +1,55 @@ +# Generated by Django 5.2.7 on 2025-11-08 19:11 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('main', '0008_alter_recommendedvacancy_vacancy_id'), + ] + + operations = [ + migrations.CreateModel( + name='JobTitle', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('title', models.CharField(max_length=255, unique=True)), + ], + ), + migrations.AddField( + model_name='customercv', + name='max_salary_rub', + field=models.PositiveIntegerField(blank=True, default=None, null=True), + ), + migrations.AddField( + model_name='customercv', + name='min_salary_rub', + field=models.PositiveIntegerField(blank=True, default=None, null=True), + ), + migrations.AddField( + model_name='customercv', + name='job_title', + field=models.ForeignKey(default=0, on_delete=django.db.models.deletion.CASCADE, to='main.jobtitle'), + preserve_default=False, + ), + migrations.CreateModel( + name='Vacancy', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('external_id', models.CharField(max_length=255, unique=True)), + ('min_salary_rub', models.PositiveIntegerField(blank=True, default=None, null=True)), + ('max_salary_rub', models.PositiveIntegerField(blank=True, default=None, null=True)), + ('content', models.TextField()), + ('timestamp', models.DateTimeField()), + ('link', models.URLField()), + ('job_title', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='main.jobtitle')), + ], + ), + migrations.AlterField( + model_name='recommendedvacancy', + name='vacancy_id', + field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='main.vacancy'), + ), + ] diff --git a/vacancies/main/models.py b/vacancies/main/models.py index 0994d14..2550659 100644 --- a/vacancies/main/models.py +++ b/vacancies/main/models.py @@ -1,5 +1,4 @@ from django.db import models -from pydantic import BaseModel class Customer(models.Model): @@ -17,8 +16,18 @@ class Customer(models.Model): db_table = "customers" +class JobTitle(models.Model): + title = models.CharField(max_length=255, unique=True) + + def __str__(self): + return self.title + + class CustomerCV(models.Model): customer = models.OneToOneField(Customer, on_delete=models.CASCADE) + job_title = models.ForeignKey(JobTitle, on_delete=models.CASCADE) + min_salary_rub = models.PositiveIntegerField(null=True, blank=True, default=None) + max_salary_rub = models.PositiveIntegerField(null=True, blank=True, default=None) content = models.TextField() created_at = models.DateTimeField(auto_now_add=True) @@ -31,9 +40,21 @@ class CustomerCV(models.Model): db_table = "customer_vcs" +class Vacancy(models.Model): + job_title = models.ForeignKey(JobTitle, on_delete=models.CASCADE) + external_id = models.CharField(max_length=255, unique=True) + min_salary_rub = models.PositiveIntegerField(null=True, blank=True, default=None) + max_salary_rub = models.PositiveIntegerField(null=True, blank=True, default=None) + content = models.TextField() + timestamp = models.DateTimeField() + link = models.URLField() + + def __str__(self): + return self.job_title.title + class RecommendedVacancy(models.Model): customer = models.ForeignKey(Customer, on_delete=models.CASCADE) - vacancy_id = models.BigIntegerField() + vacancy_id = models.ForeignKey(Vacancy, on_delete=models.CASCADE) created_at = models.DateTimeField(auto_now_add=True) objects = models.Manager() @@ -44,19 +65,3 @@ class RecommendedVacancy(models.Model): class Meta: verbose_name_plural = 'Recommended Vacancies' db_table = "recommended_vacancies" - - -class VacancyFeatures(BaseModel): - job_title: str | None = None # Должность - employment_type: str | None = None # Тип занятости - work_format: str | None = None # Формат работы - experience: str | None = None # Опыт работы - position_level: str | None = None # Уровень позиции - industry: str | None = None # Отрасль / Сфера деятельности - tech_stack: list[str] | None = None # Технологический стек / Ключевые навыки - location: str | None = None # География - salary_range: str | None = None # Зарплатные ожидания / вилка - languages: list[str] | None = None # Языки - education: str | None = None # Образование - schedule: str | None = None # График работы - additional_requirements: list[str] | None = None # Дополнительные предпочтения / требования diff --git a/vacancies/main/vector_store.py b/vacancies/main/vector_store.py index cc096ce..7cf2d25 100644 --- a/vacancies/main/vector_store.py +++ b/vacancies/main/vector_store.py @@ -1,154 +1,4 @@ -from langchain_openai import ChatOpenAI, OpenAIEmbeddings -from qdrant_client import QdrantClient, models -from qdrant_client.models import Filter, HasIdCondition - -from vacancies.conf.settings import QDRANT_URL -from vacancies.main.models import RecommendedVacancy, VacancyFeatures - -qdrant_client = QdrantClient(url=QDRANT_URL) - -FEATURE_NAMES = [ - "job_title", "employment_type", "work_format", "experience", "position_level", "industry", "tech_stack", - "location", "salary_range", "languages", "education", "schedule", "additional_requirements" -] - -weights = { - "job_title": 70, - "tech_stack": 10, - "salary_range": 10, -} - -vectors_config = { - name: models.VectorParams(size=3072, distance=models.Distance.COSINE) for name in FEATURE_NAMES -} - -if not qdrant_client.collection_exists("vacancies"): - qdrant_client.create_collection( - collection_name="vacancies", - vectors_config=vectors_config, - ) - qdrant_client.create_payload_index( - collection_name="vacancies", - field_name="timestamp", - field_schema="datetime", - ) -if not qdrant_client.collection_exists("cvs"): - qdrant_client.create_collection( - collection_name="cvs", - vectors_config=vectors_config, - ) - -embedding = OpenAIEmbeddings(model="text-embedding-3-large") - -def _prepare_texts(features): - texts = {} - for name in FEATURE_NAMES: - value = features.get(name) - if isinstance(value, list): - text = " ".join(value) if value else "" - else: - text = str(value) if value else "" - texts[name] = text - return texts - - -def embed_features(features): - features = {key: value for key, value in features.items() if value} - features_texts = _prepare_texts(features) - names, texts = features_texts.keys(), features_texts.values() - vectors = dict(zip(names, embedding.embed_documents(texts))) - return vectors - - -def add_vectors(collection_name: str, _id: int, features: dict, payload: dict, vectors): - max_similarities = {} - for name, vec in vectors.items(): - results = qdrant_client.query_points(collection_name="vacancies", query=vec, using=name, limit=100) - for res in results.points: - max_similarities.setdefault(res.id, {}) - max_similarities[res.id][name] = res.score - - scored = [] - for vid, feature_sims in max_similarities.items(): - total = sum(feature_sims[feature] * weights.get(feature, 1) for feature in feature_sims) - scored.append({"id": vid, "score": total}) - - scored.sort(key=lambda x: x["score"], reverse=True) - if scored and scored[0]["score"] > 80: # threshold - return - - qdrant_client.upsert( - collection_name=collection_name, - points=[models.PointStruct(id=_id, vector=vectors, payload=payload)] - ) - - -def search_similarities(query_filter: Filter, cv_id: int): - cv = qdrant_client.retrieve(collection_name="cvs", ids=[cv_id], with_vectors=True)[0] - - max_similarities, vacancies_content = {}, {} - for name, vec in cv.vector.items(): - results = qdrant_client.query_points( - collection_name="vacancies", - query=vec, - using=name, - limit=100000, - with_payload=True, - query_filter=query_filter, - ) - for res in results.points: - max_similarities.setdefault(res.id, {}) - vacancies_content.setdefault(res.id, {}) - - max_similarities[res.id][name] = res.score - vacancies_content[res.id]["content"] = res.payload["content"] - vacancies_content[res.id]["features_json"] = res.payload["features_json"] - vacancies_content[res.id]["link"] = res.payload["link"] - - scored = [] - for vid, feature_sims in max_similarities.items(): - total = sum(feature_sims[feature] * weights.get(feature, 1) for feature in feature_sims) - scored.append({ - "id": vid, - "score": total, - "content": vacancies_content[vid]["content"], - "features_json": vacancies_content[vid]["features_json"], - "link": vacancies_content[vid]["link"], - "sims": feature_sims, - }) - - scored.sort(key=lambda x: x["score"], reverse=True) - - return scored[0]["id"], scored[0]["content"], scored[0]["link"] - - -def batch_extract_features(contents: list[str]) -> list[VacancyFeatures]: - prompts = [ - f""" - Extract the following features from the job vacancy description. If a feature is not mentioned, set it to null. - Features: - - job_title: Должность (e.g., DevOps, Python программист) - - employment_type: Тип занятости (e.g., Полная занятость, Частичная) - - work_format: Формат работы (e.g., Офис, Удалённо, Гибрид) - - experience: Опыт работы (e.g., 3-5 лет, Нет опыта) - - position_level: Уровень позиции (e.g., Junior, Senior) - - industry: Отрасль / Сфера деятельности (e.g., IT, Финансы) - - tech_stack: Технологический стек / Ключевые навыки (list of strings) - - location: География (e.g., Москва, Россия) - - salary_range: Зарплатные ожидания / вилка (e.g., 100000-200000 руб) - - languages: Языки (list of strings, e.g., ["Русский", "Английский"]) - - education: Образование (e.g., Высшее, Среднее специальное) - - schedule: График работы (e.g., Полный день, Сменный) - - additional_requirements: Дополнительные предпочтения / требования (list of strings) - Vacancy content: - {content} - """ - for content in contents - ] - openai_client = ChatOpenAI(model_name="gpt-5-mini", reasoning_effort="minimal", temperature=0, seed=42, top_p=1) - structured_llm = openai_client.with_structured_output(VacancyFeatures) - response = structured_llm.batch(prompts) - return response +from vacancies.main.models import RecommendedVacancy, Vacancy def get_next_vacancy(customer_cv): @@ -156,16 +6,14 @@ def get_next_vacancy(customer_cv): customer=customer_cv.customer, ).values_list('vacancy_id', flat=True) - query_filter = Filter(must_not=[HasIdCondition(has_id=recommended_vacancy_ids)]) - result = search_similarities(query_filter, customer_cv.id) - if not result: - return None + vacancy = Vacancy.objects.exclude(id__in=recommended_vacancy_ids).filter( + job_title=customer_cv.job_title, + min_salary_rub__gt=customer_cv.min_salary_rub, + ).first() - search_result_id, vacancy_content, link = result - - recommendation = RecommendedVacancy.objects.create( + RecommendedVacancy.objects.create( customer=customer_cv.customer, - vacancy_id=search_result_id, + vacancy=vacancy, ) - return recommendation, vacancy_content, link + return vacancy