Compare commits


2 Commits

SHA1        Message                                Date
45e89be6d0  Implement simplified recommendations   2025-11-08 22:40:14 +03:00
b31ef06ec0  Update readme                           2025-11-08 19:19:26 +03:00
10 changed files with 182 additions and 246 deletions

View File

@@ -1,24 +1,41 @@
# vision-career-backend
# vision-career
Sample `.env`:
```dotenv
DEEPINFRA_API_TOKEN=your-token-here
OPENAI_API_KEY=your-token-here
OPENAI_PROXY=http://user:password@host:port
BOT_TOKEN=your-token-here
SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt
SECRET_KEY=secret
DEBUG=true
```
Commands:
```bash
docker compose up -d
KUBECONFIG=clickhouse-kubeconfig.yaml kubectl port-forward svc/clickhouse-clickhouse -n clickhouse 18123:8123
uv sync
uv run --env-file .env manage.py migrate
uv run --env-file .env manage.py createsuperuser --username stromenko_es --email estromenko@mail.ru
uv run --env-file .env manage.py collectstatic
uv run --env-file .env manage.py runserver
uv run --env-file .env manage.py generate_recommended_vacancies
uv run --env-file .env manage.py collect_vacancies_from_telegram_messages
uv run --env-file .env manage.py runbot
```
Production port-forwards:
```bash
KUBECONFIG=production-kubeconfig.yaml kubectl port-forward svc/qdrant -n qdrant 6333:6333
KUBECONFIG=production-kubeconfig.yaml kubectl port-forward svc/main-cluster-rw -n postgresql-cluster 5432
```

View File

@@ -1,11 +1,4 @@
services:
qdrant:
image: qdrant/qdrant:latest
restart: always
ports:
- "127.0.0.1:6333:6333"
volumes:
- "/srv/vision-career/qdrant:/qdrant/storage"
postgres:
image: postgres:17-alpine3.20
restart: always

View File

@@ -14,3 +14,13 @@ class CustomerCVADMIN(admin.ModelAdmin):
@admin.register(models.RecommendedVacancy)
class RecommendedVacancyAdmin(admin.ModelAdmin):
pass
@admin.register(models.Vacancy)
class VacancyAdmin(admin.ModelAdmin):
pass
@admin.register(models.JobTitle)
class JobTitleAdmin(admin.ModelAdmin):
pass

View File

@@ -1,5 +1,4 @@
import io
import asyncio
import os
import traceback
@@ -22,14 +21,11 @@ from telegram.ext import (
filters,
)
from pydantic import BaseModel
from typing import Literal
from vacancies.conf.settings import DB_URI
from vacancies.main.models import Customer, CustomerCV
from vacancies.main.vector_store import (
add_vectors,
batch_extract_features,
get_next_vacancy,
embed_features,
)
from vacancies.main.models import Customer, CustomerCV, JobTitle
from vacancies.main.vector_store import get_next_vacancy
SYSTEM_PROMPT = """
Ты карьерный копилот для ИТ. Ты можешь отвечать на любые вопросы по тематике карьеры.
@@ -69,19 +65,17 @@ async def next_vacancy(update: Update, context: ContextTypes.DEFAULT_TYPE):
await context.bot.send_message(chat_id=update.effective_chat.id, text=message)
return
result = get_next_vacancy(customer_cv)
if not result:
vacancy = get_next_vacancy(customer_cv)
if not vacancy:
message = "Вакансии закончились, возвращайтесь позже!"
await context.bot.send_message(chat_id=update.effective_chat.id, text=message)
return
recommendation, vacancy_content, link = result
await context.bot.send_message(
chat_id=update.effective_chat.id,
text=vacancy_content,
text=vacancy.content,
reply_markup=InlineKeyboardMarkup([[
InlineKeyboardButton("Откликнуться", url=link),
InlineKeyboardButton("Откликнуться", url=vacancy.link),
]]),
)
@@ -123,23 +117,28 @@ async def handle_document(update: Update, context: ContextTypes.DEFAULT_TYPE):
reader = PdfReader(buffer)
resume = "\n".join(page.extract_text() for page in reader.pages)
job_titles = JobTitle.objects.values_list('title', flat=True)
job_title_map = dict(JobTitle.objects.values_list('title', 'id'))
class Structure(BaseModel):
job_title: Literal[tuple(job_titles)]
min_salary_rub: int | None
max_salary_rub: int | None
openai_client = ChatOpenAI(model_name="gpt-5-mini", reasoning_effort="minimal", temperature=0, seed=42, top_p=1)
structured_llm = openai_client.with_structured_output(Structure)
prompt = f"Extract fields from following CV: {resume}"
response = await structured_llm.ainvoke(prompt)
customer = await Customer.objects.aget(telegram_id=update.effective_user.id)
customer_cv, _ = await CustomerCV.objects.aupdate_or_create(customer=customer, defaults=dict(
content=resume,
job_title_id=job_title_map[response.job_title],
min_salary_rub=response.min_salary_rub,
max_salary_rub=response.max_salary_rub,
))
def upload_vectors():
features = batch_extract_features([customer_cv.content])[0]
add_vectors(
"cvs",
customer_cv.id,
features.model_dump(),
{'content': customer_cv.content, 'features_json': features.model_dump()},
embed_features(features.model_dump()),
)
await asyncio.to_thread(upload_vectors)
await context.bot.editMessageText("Отлично! Запомнил Ваше резюме.", update.effective_chat.id, message.id)

View File

@@ -1,18 +1,14 @@
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta
from itertools import batched
from datetime import timedelta
from django.utils import timezone
from pydantic import BaseModel
from typing import Literal
from vacancies.main.models import Vacancy, JobTitle
from langchain_openai import ChatOpenAI
import clickhouse_connect
from django.core.management import BaseCommand
from django.conf import settings
from qdrant_client.models import OrderBy
from vacancies.main.vector_store import (
add_vectors,
batch_extract_features,
embed_features,
qdrant_client,
)
query = """
SELECT DISTINCT ON (message) id, chat_username, telegram_id, message, timestamp
@@ -38,23 +34,38 @@ class Command(BaseCommand):
help = "Collect vacancies from telegram messages"
def handle(self, *args, **options):
response = qdrant_client.scroll(collection_name="vacancies", limit=1, order_by=OrderBy(key="timestamp", direction="desc"))
last_point_timestamp = datetime.now() - timedelta(days=30)
if response[0]:
last_point_timestamp = response[0][0].payload["timestamp"]
job_titles = JobTitle.objects.values_list('title', flat=True)
job_title_map = dict(JobTitle.objects.values_list('title', 'id'))
class Structure(BaseModel):
job_title: Literal[tuple(job_titles)]
min_salary_rub: int | None
max_salary_rub: int | None
openai_client = ChatOpenAI(model_name="gpt-5-mini", reasoning_effort="minimal", temperature=0, seed=42, top_p=1)
structured_llm = openai_client.with_structured_output(Structure)
last_timestamp = timezone.now() - timedelta(days=30)
if last_vacancy := Vacancy.objects.order_by("-timestamp").first():
last_timestamp = last_vacancy.timestamp
clickhouse_client = clickhouse_connect.create_client(host=settings.CLICKHOUSE_HOST, port=settings.CLICKHOUSE_PORT)
result_rows = clickhouse_client.query(query, parameters={"timestamp": last_point_timestamp}).result_rows
result_rows = clickhouse_client.query(query, parameters={"timestamp": last_timestamp}).result_rows
for index, rows in enumerate(batched(result_rows, settings.COLLECT_VACANCIES_BATCH_SIZE)):
vacancies_features = batch_extract_features([row[3] for row in rows])
print(f"Processing {index+1}/{len(result_rows)//settings.COLLECT_VACANCIES_BATCH_SIZE}")
with ThreadPoolExecutor() as pool:
vacancies_vectors = pool.map(embed_features, [vacancy_features.model_dump() for vacancy_features in vacancies_features])
for row, vacancy_features, vacancy_vectors in zip(rows, vacancies_features, vacancies_vectors):
prompts = [f"Extract fields from following vacancies: {row[3]}" for row in rows]
responses = structured_llm.batch(prompts)
vacancies = []
for row, response in zip(rows, responses):
print(response)
(id, chat_username, telegram_id, message, timestamp) = row
link = f"https://t.me/{chat_username}/{telegram_id}"
payload = {'content': message, 'features_json': vacancy_features.model_dump(), "link": link, "timestamp": timestamp}
add_vectors("vacancies", id, vacancy_features.model_dump(), payload, vacancy_vectors)
vacancies.append(Vacancy(
external_id=id,
job_title_id=job_title_map[response.job_title],
min_salary_rub=response.min_salary_rub,
max_salary_rub=response.max_salary_rub,
content=message,
timestamp=timestamp,
link=f"https://t.me/{chat_username}/{telegram_id}",
))
print(Vacancy.objects.bulk_create(vacancies, ignore_conflicts=True))

View File

@@ -15,16 +15,14 @@ class Command(BaseCommand):
async def ahandle(self, *args, **options):
for customer_cv in CustomerCV.objects.all():
result = get_next_vacancy(customer_cv)
if not result:
vacancy = get_next_vacancy(customer_cv)
if not vacancy:
continue
recommendation, vacancy_content, link = result
await application.bot.send_message(
chat_id=recommendation.customer.chat_id,
text=vacancy_content,
chat_id=customer_cv.customer.chat_id,
text=vacancy.content,
reply_markup=InlineKeyboardMarkup([[
InlineKeyboardButton("Откликнуться", url=link),
InlineKeyboardButton("Откликнуться", url=vacancy.link),
]]),
)

View File

@@ -14,6 +14,6 @@ class Command(BaseCommand):
checkpointer.setup()
if sys.platform == "win32":
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
application.run_polling()

View File

@@ -0,0 +1,55 @@
# Generated by Django 5.2.7 on 2025-11-08 19:11
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('main', '0008_alter_recommendedvacancy_vacancy_id'),
]
operations = [
migrations.CreateModel(
name='JobTitle',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('title', models.CharField(max_length=255, unique=True)),
],
),
migrations.AddField(
model_name='customercv',
name='max_salary_rub',
field=models.PositiveIntegerField(blank=True, default=None, null=True),
),
migrations.AddField(
model_name='customercv',
name='min_salary_rub',
field=models.PositiveIntegerField(blank=True, default=None, null=True),
),
migrations.AddField(
model_name='customercv',
name='job_title',
field=models.ForeignKey(default=0, on_delete=django.db.models.deletion.CASCADE, to='main.jobtitle'),
preserve_default=False,
),
migrations.CreateModel(
name='Vacancy',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('external_id', models.CharField(max_length=255, unique=True)),
('min_salary_rub', models.PositiveIntegerField(blank=True, default=None, null=True)),
('max_salary_rub', models.PositiveIntegerField(blank=True, default=None, null=True)),
('content', models.TextField()),
('timestamp', models.DateTimeField()),
('link', models.URLField()),
('job_title', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='main.jobtitle')),
],
),
migrations.AlterField(
model_name='recommendedvacancy',
name='vacancy_id',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='main.vacancy'),
),
]

View File

@@ -1,5 +1,4 @@
from django.db import models
from pydantic import BaseModel
class Customer(models.Model):
@@ -17,8 +16,18 @@ class Customer(models.Model):
db_table = "customers"
class JobTitle(models.Model):
title = models.CharField(max_length=255, unique=True)
def __str__(self):
return self.title
class CustomerCV(models.Model):
customer = models.OneToOneField(Customer, on_delete=models.CASCADE)
job_title = models.ForeignKey(JobTitle, on_delete=models.CASCADE)
min_salary_rub = models.PositiveIntegerField(null=True, blank=True, default=None)
max_salary_rub = models.PositiveIntegerField(null=True, blank=True, default=None)
content = models.TextField()
created_at = models.DateTimeField(auto_now_add=True)
@@ -31,9 +40,21 @@ class CustomerCV(models.Model):
db_table = "customer_vcs"
class Vacancy(models.Model):
job_title = models.ForeignKey(JobTitle, on_delete=models.CASCADE)
external_id = models.CharField(max_length=255, unique=True)
min_salary_rub = models.PositiveIntegerField(null=True, blank=True, default=None)
max_salary_rub = models.PositiveIntegerField(null=True, blank=True, default=None)
content = models.TextField()
timestamp = models.DateTimeField()
link = models.URLField()
def __str__(self):
return self.job_title.title
class RecommendedVacancy(models.Model):
customer = models.ForeignKey(Customer, on_delete=models.CASCADE)
vacancy_id = models.BigIntegerField()
vacancy_id = models.ForeignKey(Vacancy, on_delete=models.CASCADE)
created_at = models.DateTimeField(auto_now_add=True)
objects = models.Manager()
@@ -44,19 +65,3 @@ class RecommendedVacancy(models.Model):
class Meta:
verbose_name_plural = 'Recommended Vacancies'
db_table = "recommended_vacancies"
class VacancyFeatures(BaseModel):
job_title: str | None = None # Должность
employment_type: str | None = None # Тип занятости
work_format: str | None = None # Формат работы
experience: str | None = None # Опыт работы
position_level: str | None = None # Уровень позиции
industry: str | None = None # Отрасль / Сфера деятельности
tech_stack: list[str] | None = None # Технологический стек / Ключевые навыки
location: str | None = None # География
salary_range: str | None = None # Зарплатные ожидания / вилка
languages: list[str] | None = None # Языки
education: str | None = None # Образование
schedule: str | None = None # График работы
additional_requirements: list[str] | None = None # Дополнительные предпочтения / требования

View File

@@ -1,154 +1,4 @@
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from qdrant_client import QdrantClient, models
from qdrant_client.models import Filter, HasIdCondition
from vacancies.conf.settings import QDRANT_URL
from vacancies.main.models import RecommendedVacancy, VacancyFeatures
qdrant_client = QdrantClient(url=QDRANT_URL)
FEATURE_NAMES = [
"job_title", "employment_type", "work_format", "experience", "position_level", "industry", "tech_stack",
"location", "salary_range", "languages", "education", "schedule", "additional_requirements"
]
weights = {
"job_title": 70,
"tech_stack": 10,
"salary_range": 10,
}
vectors_config = {
name: models.VectorParams(size=3072, distance=models.Distance.COSINE) for name in FEATURE_NAMES
}
if not qdrant_client.collection_exists("vacancies"):
qdrant_client.create_collection(
collection_name="vacancies",
vectors_config=vectors_config,
)
qdrant_client.create_payload_index(
collection_name="vacancies",
field_name="timestamp",
field_schema="datetime",
)
if not qdrant_client.collection_exists("cvs"):
qdrant_client.create_collection(
collection_name="cvs",
vectors_config=vectors_config,
)
embedding = OpenAIEmbeddings(model="text-embedding-3-large")
def _prepare_texts(features):
texts = {}
for name in FEATURE_NAMES:
value = features.get(name)
if isinstance(value, list):
text = " ".join(value) if value else ""
else:
text = str(value) if value else ""
texts[name] = text
return texts
def embed_features(features):
features = {key: value for key, value in features.items() if value}
features_texts = _prepare_texts(features)
names, texts = features_texts.keys(), features_texts.values()
vectors = dict(zip(names, embedding.embed_documents(texts)))
return vectors
def add_vectors(collection_name: str, _id: int, features: dict, payload: dict, vectors):
max_similarities = {}
for name, vec in vectors.items():
results = qdrant_client.query_points(collection_name="vacancies", query=vec, using=name, limit=100)
for res in results.points:
max_similarities.setdefault(res.id, {})
max_similarities[res.id][name] = res.score
scored = []
for vid, feature_sims in max_similarities.items():
total = sum(feature_sims[feature] * weights.get(feature, 1) for feature in feature_sims)
scored.append({"id": vid, "score": total})
scored.sort(key=lambda x: x["score"], reverse=True)
if scored and scored[0]["score"] > 80: # threshold
return
qdrant_client.upsert(
collection_name=collection_name,
points=[models.PointStruct(id=_id, vector=vectors, payload=payload)]
)
def search_similarities(query_filter: Filter, cv_id: int):
cv = qdrant_client.retrieve(collection_name="cvs", ids=[cv_id], with_vectors=True)[0]
max_similarities, vacancies_content = {}, {}
for name, vec in cv.vector.items():
results = qdrant_client.query_points(
collection_name="vacancies",
query=vec,
using=name,
limit=100000,
with_payload=True,
query_filter=query_filter,
)
for res in results.points:
max_similarities.setdefault(res.id, {})
vacancies_content.setdefault(res.id, {})
max_similarities[res.id][name] = res.score
vacancies_content[res.id]["content"] = res.payload["content"]
vacancies_content[res.id]["features_json"] = res.payload["features_json"]
vacancies_content[res.id]["link"] = res.payload["link"]
scored = []
for vid, feature_sims in max_similarities.items():
total = sum(feature_sims[feature] * weights.get(feature, 1) for feature in feature_sims)
scored.append({
"id": vid,
"score": total,
"content": vacancies_content[vid]["content"],
"features_json": vacancies_content[vid]["features_json"],
"link": vacancies_content[vid]["link"],
"sims": feature_sims,
})
scored.sort(key=lambda x: x["score"], reverse=True)
return scored[0]["id"], scored[0]["content"], scored[0]["link"]
def batch_extract_features(contents: list[str]) -> list[VacancyFeatures]:
prompts = [
f"""
Extract the following features from the job vacancy description. If a feature is not mentioned, set it to null.
Features:
- job_title: Должность (e.g., DevOps, Python программист)
- employment_type: Тип занятости (e.g., Полная занятость, Частичная)
- work_format: Формат работы (e.g., Офис, Удалённо, Гибрид)
- experience: Опыт работы (e.g., 3-5 лет, Нет опыта)
- position_level: Уровень позиции (e.g., Junior, Senior)
- industry: Отрасль / Сфера деятельности (e.g., IT, Финансы)
- tech_stack: Технологический стек / Ключевые навыки (list of strings)
- location: География (e.g., Москва, Россия)
- salary_range: Зарплатные ожидания / вилка (e.g., 100000-200000 руб)
- languages: Языки (list of strings, e.g., ["Русский", "Английский"])
- education: Образование (e.g., Высшее, Среднее специальное)
- schedule: График работы (e.g., Полный день, Сменный)
- additional_requirements: Дополнительные предпочтения / требования (list of strings)
Vacancy content:
{content}
"""
for content in contents
]
openai_client = ChatOpenAI(model_name="gpt-5-mini", reasoning_effort="minimal", temperature=0, seed=42, top_p=1)
structured_llm = openai_client.with_structured_output(VacancyFeatures)
response = structured_llm.batch(prompts)
return response
from vacancies.main.models import RecommendedVacancy, Vacancy
def get_next_vacancy(customer_cv):
@@ -156,16 +6,14 @@ def get_next_vacancy(customer_cv):
customer=customer_cv.customer,
).values_list('vacancy_id', flat=True)
query_filter = Filter(must_not=[HasIdCondition(has_id=recommended_vacancy_ids)])
result = search_similarities(query_filter, customer_cv.id)
if not result:
return None
vacancy = Vacancy.objects.exclude(id__in=recommended_vacancy_ids).filter(
job_title=customer_cv.job_title,
min_salary_rub__gt=customer_cv.min_salary_rub,
).first()
search_result_id, vacancy_content, link = result
recommendation = RecommendedVacancy.objects.create(
RecommendedVacancy.objects.create(
customer=customer_cv.customer,
vacancy_id=search_result_id,
vacancy=vacancy,
)
return recommendation, vacancy_content, link
return vacancy