Link to the LinkedIn post
Have you ever wondered how Python, Data Science, SQL and related tools can be used to improve your skills and career? If so, the channel "Заскуль питона (Data Science)" (@zasql_python) is the place for you. The channel covers Python, Data Science, SQL and other topics that will help you expand your knowledge and skills in IT.
"Заскуль питона (Data Science)" is a source of practical information and recommendations from experienced IT specialists. Here you will find articles, tips, tool reviews and much more to help you grow into a strong Data Science and Python specialist.
For questions about collaboration or advertising, contact the channel admin @m459n9. The channel also links to a chat where you can talk with like-minded people and ask your questions. Join "Заскуль питона (Data Science)" now and start your path to success in IT!
02 Feb, 12:05
WITH user_activity AS (
    SELECT
        user_id,
        event_date,
        DATE_TRUNC('week', event_date)  AS week_start,
        DATE_TRUNC('month', event_date) AS month_start
    FROM user_events
    WHERE event_date BETWEEN '2024-01-01' AND '2024-12-31'
),
daily AS (
    SELECT event_date, COUNT(DISTINCT user_id) AS dau
    FROM user_activity
    GROUP BY event_date
),
weekly AS (
    SELECT week_start, COUNT(DISTINCT user_id) AS wau
    FROM user_activity
    GROUP BY week_start
),
monthly AS (
    SELECT month_start, COUNT(DISTINCT user_id) AS mau
    FROM user_activity
    GROUP BY month_start
)
SELECT
    d.event_date,
    d.dau,
    w.wau,
    m.mau
FROM daily d
JOIN weekly  w ON DATE_TRUNC('week',  d.event_date) = w.week_start
JOIN monthly m ON DATE_TRUNC('month', d.event_date) = m.month_start
ORDER BY d.event_date;
WITH daily_users AS (
    SELECT
        event_date,
        user_id
    FROM user_events
    WHERE event_date BETWEEN '2024-01-01' AND '2024-01-30'
),
date_series AS (
    SELECT DISTINCT event_date
    FROM daily_users
),
cumulative_users AS (
    -- for every date, count all distinct users seen on or before that date
    SELECT
        d.event_date,
        COUNT(DISTINCT u.user_id) AS cumulative_unique_users
    FROM date_series d
    LEFT JOIN daily_users u ON u.event_date <= d.event_date
    GROUP BY d.event_date
)
SELECT *
FROM cumulative_users
ORDER BY event_date;
WITH daily_feature_users AS (
    SELECT
        event_date,
        user_id
    FROM user_events
    WHERE event_name = 'feature_x'
      AND event_date BETWEEN '2024-01-01' AND '2024-01-30'
),
daily_total_users AS (
    SELECT
        event_date,
        user_id
    FROM user_events
    WHERE event_date BETWEEN '2024-01-01' AND '2024-01-30'
),
date_series AS (
    SELECT DISTINCT event_date
    FROM daily_total_users
),
cumulative_feature_users AS (
    SELECT
        d.event_date,
        COUNT(DISTINCT u.user_id) AS cumulative_feature_users
    FROM date_series d
    LEFT JOIN daily_feature_users u ON u.event_date <= d.event_date
    GROUP BY d.event_date
),
cumulative_total_users AS (
    SELECT
        d.event_date,
        COUNT(DISTINCT u.user_id) AS cumulative_total_users
    FROM date_series d
    LEFT JOIN daily_total_users u ON u.event_date <= d.event_date
    GROUP BY d.event_date
)
SELECT
    cfu.event_date,
    cfu.cumulative_feature_users,
    ctu.cumulative_total_users,
    -- NULLIF protects against division by zero on days with no active users
    ROUND(100.0 * cfu.cumulative_feature_users / NULLIF(ctu.cumulative_total_users, 0), 2) AS penetration_rate
FROM cumulative_feature_users cfu
JOIN cumulative_total_users ctu ON cfu.event_date = ctu.event_date
ORDER BY cfu.event_date;
27 Jan, 11:06
import numpy as np

def bootstrap_ratio(data, nominator, denominator, group_column, group_value, n_iter=10000):
    """Bootstrap the distribution of a ratio metric within one group."""
    group_data = data[data[group_column] == group_value]
    boot_ratios = []
    for _ in range(n_iter):
        sample = group_data.sample(len(group_data), replace=True)
        ratio = sample[nominator].sum() / sample[denominator].sum()
        boot_ratios.append(ratio)
    return np.array(boot_ratios)

def bucketize(data, nominator, denominator, n_buckets=50, random_state=42):
    """Shuffle observations into buckets and compute the ratio metric per bucket."""
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    buckets = np.array_split(data, n_buckets)
    bucket_ratios = [bucket[nominator].sum() / bucket[denominator].sum() for bucket in buckets]
    return bucket_ratios

def calculate_ratio_variance(values_numerator, values_denominator):
    """Delta-method approximation of the variance of a ratio metric."""
    mean_num = np.mean(values_numerator)
    mean_denom = np.mean(values_denominator)
    variance_num = np.var(values_numerator, ddof=1)
    variance_denom = np.var(values_denominator, ddof=1)
    covariance_num_denom = np.cov(values_numerator, values_denominator)[0, 1]
    ratio_variance = (
        (variance_num / mean_denom ** 2)
        - (2 * (mean_num / mean_denom ** 3) * covariance_num_denom)
        + ((mean_num ** 2 / mean_denom ** 4) * variance_denom)
    )
    return ratio_variance

# ratio_control is the ratio metric in the control group (the same value is computed for the test group)
def calculate_ratio_control(numerator_control, denominator_control):
    return sum(numerator_control) / sum(denominator_control)

ratio_control = calculate_ratio_control(numerator_control, denominator_control)

def linearization(numerator, denominator, ratio_control):
    """Linearized per-user metric: numerator - ratio_control * denominator."""
    return numerator - ratio_control * denominator
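The functions above can be combined into a simple workflow for a ratio metric such as CTR = clicks / views. A minimal sketch with made-up data, assuming a per-user DataFrame with columns "clicks", "views" and a "group" column holding "control"/"test" (all names and numbers below are illustrative, not from the post):

import numpy as np
import pandas as pd
from scipy import stats

# Hypothetical per-user data for two experiment groups
rng = np.random.default_rng(0)
df = pd.DataFrame({
    "group": np.repeat(["control", "test"], 5000),
    "views": rng.poisson(20, 10000),
})
df["clicks"] = rng.binomial(df["views"], np.where(df["group"] == "test", 0.105, 0.10))

# Bootstrap confidence interval for the control-group CTR
boot = bootstrap_ratio(df, "clicks", "views", "group", "control", n_iter=2000)
print("control CTR 95% CI:", np.percentile(boot, [2.5, 97.5]))

# Linearization: compare the groups with an ordinary t-test on the linearized metric
control = df[df["group"] == "control"]
test = df[df["group"] == "test"]
ratio_control = calculate_ratio_control(control["clicks"], control["views"])
lin_control = linearization(control["clicks"], control["views"], ratio_control)
lin_test = linearization(test["clicks"], test["views"], ratio_control)
print(stats.ttest_ind(lin_test, lin_control))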
15 Dec, 04:37
with cohort_base as (
    select
        u.user_id,
        date(u.registration_date) as cohort_date,
        datediff(e.event_date, u.registration_date) as days_since_registration
    from users u
    left join events e
        on u.user_id = e.user_id
    where e.event_date is not null
),
cohort_activity as (
    select
        cohort_date,
        days_since_registration,
        count(distinct user_id) as active_users
    from cohort_base
    where days_since_registration between 0 and 30
    group by cohort_date, days_since_registration
),
cohort_size as (
    select
        date(registration_date) as cohort_date,
        count(distinct user_id) as cohort_size
    from users
    group by cohort_date
)
select
    ca.cohort_date,
    ca.days_since_registration,
    round(ca.active_users * 100.0 / cs.cohort_size, 2) as retention_rate
from cohort_activity ca
join cohort_size cs
    on ca.cohort_date = cs.cohort_date
order by ca.cohort_date, ca.days_since_registration;
cohort_base
- joins the users table with the events table to determine how many days after registration each user performed an event.
cohort_activity
- counts, for each cohort, the number of unique users who were active on each day from 0 to 30 after registration.
cohort_size
- determines the size of each cohort, i.e. the number of users who registered on a given day.
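The long output of this query can be pivoted into the usual cohort matrix. A minimal pandas sketch using a few hypothetical rows in the same shape as the query result (the DataFrame and its values are illustrative, not from the post):

import pandas as pd

# Hypothetical rows in the shape returned by the retention query above
retention_df = pd.DataFrame({
    "cohort_date": ["2024-01-01", "2024-01-01", "2024-01-02", "2024-01-02"],
    "days_since_registration": [0, 1, 0, 1],
    "retention_rate": [100.0, 42.5, 100.0, 39.8],
})

# Pivot: rows = cohorts, columns = days since registration, values = retention %
retention_matrix = retention_df.pivot_table(
    index="cohort_date",
    columns="days_since_registration",
    values="retention_rate",
)
print(retention_matrix)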
11 Dec, 06:59
import scipy.stats as stats
import numpy as np
control_conversions = 500
control_total = 10000
test_conversions = 555
test_total = 10000
control_rate = control_conversions / control_total
test_rate = test_conversions / test_total
effect = test_rate - control_rate
# Standard error of the difference between two proportions and a two-sided z-test
se = np.sqrt((control_rate * (1 - control_rate)) / control_total +
             (test_rate * (1 - test_rate)) / test_total)
z_score = effect / se
p_value = 2 * (1 - stats.norm.cdf(abs(z_score)))
ci_low, ci_high = effect - 1.96 * se, effect + 1.96 * se  # 95% confidence interval
print('Difference in means, %')
print(np.round(effect, 3) * 100)
print('Confidence interval, %')
print([np.round(ci_low, 3) * 100, np.round(ci_high, 3) * 100])
print('p-value')
print(np.round(p_value, 2))
# Difference in means, %
# 0.5
# Confidence interval, %
# [-0.1, 1.2]
# p-value
# 0.08

# Translate the confidence interval for the conversion-rate difference into a revenue range
arppu = 1000
low_effect_arppu = ci_low * control_total * arppu
high_effect_arppu = ci_high * control_total * arppu
print([low_effect_arppu, high_effect_arppu])
# [-6955.767415148702, 116955.76741514866]
03 Dec, 06:41
import hashlib

# Deterministic 50/50 split based on a hash of the user id
user_id = "12345"
hash_value = int(hashlib.md5(user_id.encode()).hexdigest(), 16)
group = "A" if hash_value % 2 == 0 else "B"
print(group)

# Salting the id with the experiment name decorrelates splits across experiments
experiment_name = "new_checkout_flow"
salted_id = f"{experiment_name}_{user_id}"
hash_value = int(hashlib.md5(salted_id.encode()).hexdigest(), 16)
group = "A" if hash_value % 2 == 0 else "B"

# The same user id hashed with different salts gives different values
print(int(hashlib.md5(f'salt2_{123}'.encode()).hexdigest(), 16))
print(int(hashlib.md5(f'salt9_{123}'.encode()).hexdigest(), 16))

# Unequal split, e.g. 70/30
split_percentage = 70
group = "A" if hash_value % 100 < split_percentage else "B"
18 Nov, 06:17
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification()
lr = LogisticRegression()
lr.fit(X, y)

# Manually apply the sigmoid to the linear combination of features and learned weights
z = np.dot(X, lr.coef_.reshape(-1, 1)) + lr.intercept_
model_predict = lr.predict_proba(X)[:5, 1]
naive = (1 / (1 + np.exp(-z))).flatten()[:5]

# Both outputs should match: predict_proba is the sigmoid of X @ coef + intercept
print(model_predict)
print(naive)
26 Sep, 05:53
# Sketch of the update rule: start from some point and repeatedly step
# against the gradient, scaled by the learning rate
start_point = i                 # initial point
iterations = n                  # number of iterations
learning_rate = t               # learning rate (step size)
for _ in range(iterations):
    start_point -= learning_rate * gradient(start_point)
start_point, func(start_point)  # the point found and the loss value at it
import numpy as np

def loss_function(x):
    return x ** 2 * np.sin(x)

def gradient(x):
    # derivative of x^2 * sin(x) by the product rule
    return 2 * x * np.sin(x) + x ** 2 * np.cos(x)

def gradient_descent(starting_point, learning_rate, num_iterations):
    x = starting_point
    for _ in range(num_iterations):
        x -= learning_rate * gradient(x)
    return x

starting_point = 3      # initial point, chosen arbitrarily
learning_rate = 0.01    # learning rate
num_iterations = 100    # number of iterations
minimum = gradient_descent(starting_point, learning_rate, num_iterations)
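As a quick sanity check (my addition, not part of the original post), the point found by gradient descent can be compared with a local minimizer from scipy over a small interval around it:

from scipy.optimize import minimize_scalar

# Search for a local minimum of loss_function near the point found by gradient descent
result = minimize_scalar(loss_function, bounds=(minimum - 1, minimum + 1), method="bounded")
print(minimum, loss_function(minimum))   # gradient-descent estimate and its loss
print(result.x, result.fun)              # scipy's local minimum in the same neighborhood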