In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from datetime import datetime
from scipy.stats import spearmanr
from sklearn.preprocessing import LabelEncoder

Грузим данные. Они уже отфильтрованы (одна запись от пользователя в сутки, пустые удалены). Заодно спрячем пользовательские id из телеграма.

In [2]:
df = pd.read_csv('results.csv')
le = LabelEncoder()
df['user_id'] = le.fit_transform(df['user_id'])
df
Out[2]:
Unnamed: 0 user_id date sex timezone age city occupation question_time_minutes white ... purple pink lightskyblue beige saddlebrown orchid maroon mediumturquoise coral greenyellow
0 0 93 2021-04-13 12:31:07.731 female 3 28.0 Спб worker 1320.0 0 ... 1 0 0 0 0 0 0 0 0 0
1 1 93 2021-04-14 04:37:16.818 female 3 28.0 Спб worker 1320.0 1 ... 0 0 0 0 0 0 0 0 0 0
2 2 93 2021-04-15 08:01:14.953 female 3 28.0 Спб worker 1320.0 0 ... 0 0 0 0 0 0 0 0 0 0
3 3 93 2021-04-16 22:34:46.480 female 3 28.0 Спб worker 1320.0 0 ... 0 0 0 0 0 1 0 0 0 0
4 4 93 2021-04-17 18:53:59.683 female 3 28.0 Спб worker 1320.0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2343 2343 57 2021-05-16 18:03:51.737 female 3 30.0 Санкт-Петербург worker 1240.0 0 ... 0 1 0 1 0 0 0 0 0 0
2344 2344 57 2021-05-17 17:41:11.994 female 3 30.0 Санкт-Петербург worker 1240.0 0 ... 0 1 0 1 1 0 0 0 0 0
2345 2345 96 2021-05-14 10:07:32.625 female 3 32.0 Санкт-Петербург worker 780.0 0 ... 0 0 1 0 0 0 1 0 0 0
2346 2346 96 2021-05-15 10:15:23.421 female 3 32.0 Санкт-Петербург worker 780.0 0 ... 0 1 1 0 0 0 1 0 0 0
2347 2347 96 2021-05-16 21:04:26.019 female 3 32.0 Санкт-Петербург worker 780.0 1 ... 0 0 0 0 0 0 0 0 0 0

2348 rows × 27 columns

In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2348 entries, 0 to 2347
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             2348 non-null   int64  
 1   user_id                2348 non-null   int64  
 2   date                   2348 non-null   object 
 3   sex                    2345 non-null   object 
 4   timezone               2348 non-null   int64  
 5   age                    2338 non-null   float64
 6   city                   2345 non-null   object 
 7   occupation             2338 non-null   object 
 8   question_time_minutes  2338 non-null   float64
 9   white                  2348 non-null   int64  
 10  gray                   2348 non-null   int64  
 11  black                  2348 non-null   int64  
 12  red                    2348 non-null   int64  
 13  blue                   2348 non-null   int64  
 14  yellow                 2348 non-null   int64  
 15  green                  2348 non-null   int64  
 16  orange                 2348 non-null   int64  
 17  purple                 2348 non-null   int64  
 18  pink                   2348 non-null   int64  
 19  lightskyblue           2348 non-null   int64  
 20  beige                  2348 non-null   int64  
 21  saddlebrown            2348 non-null   int64  
 22  orchid                 2348 non-null   int64  
 23  maroon                 2348 non-null   int64  
 24  mediumturquoise        2348 non-null   int64  
 25  coral                  2348 non-null   int64  
 26  greenyellow            2348 non-null   int64  
dtypes: float64(2), int64(21), object(4)
memory usage: 495.4+ KB
In [4]:
df['city'].unique()
Out[4]:
array(['Спб', 'Санкт-Петербург', nan, 'Москва', 'С-Пб', 'СПб', 'Berlin',
       '/send_colors', 'Петербург', 'Новосибирск', 'москва', 'Питер',
       'спб', 'Екатеринбург', 'Севастополь', 'санкт-петербург',
       'Санкт-Петербурга', 'Уппсала', 'Копенгаген', 'Дармштадт',
       'Мурманск', 'Чанчунь', 'Омск'], dtype=object)

Немного чиним данные

In [5]:
def get_occupation(x):
    if x == 'kid':
        return 0
    if x == 'student':
        return 1
    if x == 'worker':
        return 2
    if x == 'retiree':
        return 3
    return 4


def get_city(city):
    if type(city) != str:
        return 'unknown'
    if city in [ 'Петербург',
                 'петербург',
                 'Питер',
                 'питер'
                 'С-Пб',
                 'СПб',
                 'спб',
                 'с-пб',
                 'Санкт-Петербург',
                 'Санкт-Петербурга',
                 'Спб',
                 'санкт-петербург',
                 'спб' ]:
        return 'spb'
    if city.lower() in ['москва', 'moskow']:
        return 'msk'
    if city.lower() == 'екатеринбург':
        return 'ekb'
    if city.lower() == 'новосибирск':
        return 'nsk'
    if city.lower() == 'севастополь':
        return 'sev'
    return 'other'


def get_weekday(year, month, day):
    return datetime(year=year, month=month, day=day).weekday()

def get_day_number(year, month, day):
    delta = datetime(year=year, month=month, day=day) - datetime(year=2021, month=4, day=1)
    return delta.days


# удаляем индекс
df = df.drop('Unnamed: 0', axis=1)

# строим по дате признаки (а она у нас в виде строки изначально)
df['year'] = [int(x[:4]) for x in df['date']]
df['month'] = [int(x[5:7]) for x in df['date']]
df['day'] = [int(x[8:10]) for x in df['date']]
df['day_number'] = [get_day_number(int(x[:4]), int(x[5:7]), int(x[8:10])) for x in df['date']]
df['weekday'] = [get_weekday(int(x[:4]), int(x[5:7]), int(x[8:10])) for x in df['date']]
df = df.drop('date', axis=1)

# считаем, что неизвестная временная зона - это +3 
df['timezone'].fillna(3, inplace=True)

# заполняем неизвестный род занятий
df['occupation'].fillna('unknown', inplace=True)

# и неизвестный пол
df['sex'].fillna('unknown', inplace=True)

# стандартизируем город
df['city'] = [get_city(x) for x in df['city']]

# заполняем пропуски
df.fillna(df.median(), inplace=True)

# смотрим, что все хорошо заполнилось
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2348 entries, 0 to 2347
Data columns (total 30 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   user_id                2348 non-null   int64  
 1   sex                    2348 non-null   object 
 2   timezone               2348 non-null   int64  
 3   age                    2348 non-null   float64
 4   city                   2348 non-null   object 
 5   occupation             2348 non-null   object 
 6   question_time_minutes  2348 non-null   float64
 7   white                  2348 non-null   int64  
 8   gray                   2348 non-null   int64  
 9   black                  2348 non-null   int64  
 10  red                    2348 non-null   int64  
 11  blue                   2348 non-null   int64  
 12  yellow                 2348 non-null   int64  
 13  green                  2348 non-null   int64  
 14  orange                 2348 non-null   int64  
 15  purple                 2348 non-null   int64  
 16  pink                   2348 non-null   int64  
 17  lightskyblue           2348 non-null   int64  
 18  beige                  2348 non-null   int64  
 19  saddlebrown            2348 non-null   int64  
 20  orchid                 2348 non-null   int64  
 21  maroon                 2348 non-null   int64  
 22  mediumturquoise        2348 non-null   int64  
 23  coral                  2348 non-null   int64  
 24  greenyellow            2348 non-null   int64  
 25  year                   2348 non-null   int64  
 26  month                  2348 non-null   int64  
 27  day                    2348 non-null   int64  
 28  day_number             2348 non-null   int64  
 29  weekday                2348 non-null   int64  
dtypes: float64(2), int64(25), object(3)
memory usage: 550.4+ KB
In [6]:
df
Out[6]:
user_id sex timezone age city occupation question_time_minutes white gray black ... orchid maroon mediumturquoise coral greenyellow year month day day_number weekday
0 93 female 3 28.0 spb worker 1320.0 0 0 1 ... 0 0 0 0 0 2021 4 13 12 1
1 93 female 3 28.0 spb worker 1320.0 1 0 1 ... 0 0 0 0 0 2021 4 14 13 2
2 93 female 3 28.0 spb worker 1320.0 0 1 1 ... 0 0 0 0 0 2021 4 15 14 3
3 93 female 3 28.0 spb worker 1320.0 0 0 0 ... 1 0 0 0 0 2021 4 16 15 4
4 93 female 3 28.0 spb worker 1320.0 0 0 1 ... 0 0 0 0 0 2021 4 17 16 5
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2343 57 female 3 30.0 spb worker 1240.0 0 0 1 ... 0 0 0 0 0 2021 5 16 45 6
2344 57 female 3 30.0 spb worker 1240.0 0 0 0 ... 0 0 0 0 0 2021 5 17 46 0
2345 96 female 3 32.0 spb worker 780.0 0 0 0 ... 0 1 0 0 0 2021 5 14 43 4
2346 96 female 3 32.0 spb worker 780.0 0 0 0 ... 0 1 0 0 0 2021 5 15 44 5
2347 96 female 3 32.0 spb worker 780.0 1 0 1 ... 0 0 0 0 0 2021 5 16 45 6

2348 rows × 30 columns

Всего у нас 2348 уникальных записи. Посмотрим на данные (распределение разных признаков по записям и по людям). Для начала создадим новый датафрейм с информацией о каждом пользователе

In [7]:
person_features = ['user_id', 'sex', 'timezone', 'age', 'city', 'occupation']
colors = ['white', 'gray', 'black', 'red', 'blue', 'yellow', 'green', 'orange', 'purple', 'pink', 'lightskyblue', 'beige',
          'saddlebrown', 'orchid', 'maroon', 'mediumturquoise', 'coral', 'greenyellow']
In [8]:
df_persons = df[person_features].groupby('user_id').agg(pd.Series.mode)
df_persons
Out[8]:
sex timezone age city occupation
user_id
0 female 3 28.0 spb worker
1 female 3 34.0 spb worker
2 male 3 29.0 spb worker
3 female 3 30.0 spb worker
4 male 3 30.0 spb worker
... ... ... ... ... ...
123 female 8 34.0 other worker
124 female 3 22.0 spb worker
125 male 3 33.0 spb worker
126 female 3 77.0 spb retiree
127 female 3 38.0 spb worker

128 rows × 5 columns

In [9]:
for feature in ['sex', 'timezone', 'age', 'city', 'occupation']:
    plt.hist(df_persons[feature], bins=min(len(df_persons[feature].unique()), 10), edgecolor='black', alpha=0.7, color='springgreen')
    plt.xlabel(feature)
    plt.ylabel('number of persons')
    plt.show()

По записям также посмотрим на день недели, день с начала существования бота и так далее

In [10]:
for feature in ['sex', 'timezone', 'age', 'city', 'occupation', 'month', 'day', 'day_number', 'weekday']:
    plt.hist(df[feature], bins=min(len(df[feature].unique()), 10), edgecolor='black', alpha=0.7, color='green')
    plt.xlabel(feature)
    plt.ylabel('number of records')
    plt.show()

Для красоты посмотрим еще на общий графи распределения цветов по дням

In [11]:
bins = len(df['day_number'].unique())

fig = plt.figure(figsize=(15, 10))
ax = fig.add_subplot(111)
ax.set_facecolor('linen')
plt.hist([df[df[color] == 1]['day_number'] for color in colors],
         bins=bins,
         stacked=True,
         color=colors,
         edgecolor='black')
plt.xlabel('day number')
plt.ylabel('colors')

plt.show()

Создадим еще один датафрейм, где каждому человеку будем сопоставлять частоту ношения им каждого цвета (от 0 до 1, 0 - не носит никогда, 0.5 - через день, 1.0 - ежедневно)

In [12]:
df_freqs = df.groupby('user_id').sum()[colors] / df.groupby('user_id').count()[colors]
df_freqs = df_freqs.merge(df_persons, on=['user_id'])
df_freqs
Out[12]:
white gray black red blue yellow green orange purple pink ... orchid maroon mediumturquoise coral greenyellow sex timezone age city occupation
user_id
0 0.142857 0.642857 0.785714 0.071429 0.357143 0.000000 0.000000 0.000000 0.285714 0.214286 ... 0.00000 0.000000 0.285714 0.071429 0.00000 female 3 28.0 spb worker
1 0.333333 0.500000 0.333333 0.000000 0.500000 0.333333 0.333333 0.000000 0.000000 0.000000 ... 0.00000 0.166667 0.000000 0.000000 0.00000 female 3 34.0 spb worker
2 0.514286 0.285714 0.314286 0.085714 0.828571 0.085714 0.000000 0.000000 0.000000 0.028571 ... 0.00000 0.000000 0.000000 0.028571 0.00000 male 3 29.0 spb worker
3 0.200000 0.133333 0.333333 0.133333 0.333333 0.000000 0.266667 0.000000 0.200000 0.066667 ... 0.00000 0.000000 0.000000 0.000000 0.00000 female 3 30.0 spb worker
4 0.000000 0.133333 0.833333 0.400000 0.666667 0.000000 0.033333 0.000000 0.000000 0.000000 ... 0.00000 0.566667 0.000000 0.000000 0.00000 male 3 30.0 spb worker
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
123 0.000000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.500000 0.000000 0.500000 ... 0.00000 0.500000 0.000000 0.000000 0.00000 female 8 34.0 other worker
124 0.343750 0.531250 0.843750 0.000000 0.062500 0.031250 0.125000 0.031250 0.031250 0.125000 ... 0.00000 0.093750 0.000000 0.000000 0.15625 female 3 22.0 spb worker
125 0.516129 0.870968 0.419355 0.129032 0.096774 0.096774 1.000000 0.032258 0.000000 0.161290 ... 0.00000 0.000000 0.000000 0.000000 0.00000 male 3 33.0 spb worker
126 1.000000 0.666667 0.333333 0.000000 0.000000 0.000000 1.000000 0.000000 0.333333 0.000000 ... 0.00000 0.000000 0.000000 0.000000 0.00000 female 3 77.0 spb retiree
127 0.354839 0.225806 0.645161 0.096774 0.645161 0.161290 0.419355 0.032258 0.032258 0.064516 ... 0.16129 0.000000 0.032258 0.000000 0.00000 female 3 38.0 spb worker

128 rows × 23 columns

Для каждого цвета посмотрим на количество пользователей, которые его не надевали ни разу.

In [13]:
color_lovers = [(np.sum(df_freqs[color] == 0), color) for color in colors]
color_lovers.sort(key=lambda x: x[0])

fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter([x[0] for x in color_lovers], [x[1] for x in color_lovers], color=[x[1] for x in color_lovers], edgecolor='black', s=60)
ax.set_yticklabels([x[1] for x in color_lovers],
                   fontsize = 15)
plt.xlabel("Number of people who don't wear this color")
plt.show()
In [14]:
print('Минимальное количество не носивших:', color_lovers[0][0], color_lovers[0][1])
print('Мaксимальное количество не носивших:', color_lovers[-1][0], color_lovers[-1][1])
Минимальное количество не носивших: 11 black
Мaксимальное количество не носивших: 115 greenyellow

Исследуем связь пола и цвета. Для начала разделим мужчин и женщин и посмотрим с помощью непараметрического теста Манна-Уитни, одинаково ли они выбирают цвета

In [15]:
df_males = df_freqs[df_freqs['sex'] == 'male'].drop('sex', axis=1)
df_females = df_freqs[df_freqs['sex'] == 'female'].drop('sex', axis=1)

df_males.shape
Out[15]:
(38, 22)
In [16]:
df_females.shape
Out[16]:
(88, 22)

Посмотрим на нормированные гистограммы частот ношения цветов

In [17]:
from scipy.stats import mannwhitneyu

color_diffs = []
alpha = 0.05

for color in colors:
    plt.hist(df_females[color], alpha=0.7, color='pink', bins=10, density=True)
    plt.hist(df_males[color], alpha=0.4, color='blue', bins=10, density=True)
    stat, p = mannwhitneyu(df_females[color], df_males[color])
    color_diffs.append((color, stat, p))
    plt.xlabel(color + ', ' + 'stat=%.3f, p=%.3f' % (stat, p))
    plt.show()
In [18]:
color_diffs.sort(key=lambda x: -x[2])


for record in color_diffs:
    print(f'{record[0]:<16}: stat = %.3f, p = %.3f' % (record[1], record[2]))
    
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter([record[2] for record in color_diffs],
           [record[0] for record in color_diffs],
           edgecolor='black',
           color=[x[0] for x in color_diffs],
           marker='^',
           s=[100 if x[2] < alpha else 50 for x in color_diffs])
ax.axvline(x=alpha)
ax.set_yticklabels([x[0] for x in color_diffs],
                   fontsize = 15)
plt.xlabel('p')
plt.show()
coral           : stat = 1671.500, p = 0.500
black           : stat = 1604.000, p = 0.360
saddlebrown     : stat = 1590.500, p = 0.319
white           : stat = 1581.000, p = 0.315
yellow          : stat = 1570.000, p = 0.272
orange          : stat = 1543.500, p = 0.205
green           : stat = 1487.500, p = 0.161
lightskyblue    : stat = 1476.000, p = 0.146
greenyellow     : stat = 1554.000, p = 0.118
blue            : stat = 1418.500, p = 0.089
maroon          : stat = 1378.000, p = 0.043
gray            : stat = 1272.500, p = 0.017
orchid          : stat = 1382.500, p = 0.009
red             : stat = 1237.500, p = 0.008
beige           : stat = 1206.000, p = 0.005
mediumturquoise : stat = 1279.500, p = 0.002
purple          : stat = 1204.500, p = 0.002
pink            : stat = 778.000, p = 0.000

Это не очень удивительно, но женщины и мужчины неодинаково носят розово-фиолетово-бордовую гамму цветов, а также цвет морской волны, голубой и бежевый (все это женщины носят чаще). Красный, как оказалось, чаще носят мужчины, а вот почему так с серым - непонятно.

Теперь посмотрим на возраст + цвета

In [19]:
for color in colors:
    plt.scatter(df_freqs['age'], df_freqs[color], color='darkblue', s=3)
    plt.xlabel(color)
    plt.ylabel('frequency of color')
    plt.show()

Здесь мы посчитаем ранговую корреляцию Спирмена, потому что распределения возрастов и частоты цветов в ответах пользователя не нормальны

In [20]:
colors_diffs_in_age = []

for color in colors:
    coef, p = spearmanr(df_freqs['age'], df_freqs[color])
    colors_diffs_in_age.append((color, coef, p))
        
colors_diffs_in_age.sort(key=lambda x: -x[2])
        
for record in colors_diffs_in_age:
    print(f'{record[0]:<16}: coef = %.3f, p = %.3f' % (record[1], record[2]))
    
    
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter([record[2] for record in colors_diffs_in_age],
           [record[0] for record in colors_diffs_in_age],
           edgecolor='black',
           color=[x[0] for x in colors_diffs_in_age],
           marker='^',
           s=[100 if x[2] < alpha else 50 for x in colors_diffs_in_age])
ax.axvline(x=alpha)
ax.set_yticklabels([x[0] for x in colors_diffs_in_age],
                   fontsize = 15)
plt.xlabel('p')
plt.show()
coral           : coef = 0.004, p = 0.961
lightskyblue    : coef = 0.012, p = 0.894
gray            : coef = -0.017, p = 0.853
pink            : coef = -0.018, p = 0.841
maroon          : coef = 0.034, p = 0.702
mediumturquoise : coef = 0.043, p = 0.629
orange          : coef = -0.066, p = 0.457
orchid          : coef = 0.071, p = 0.428
purple          : coef = 0.084, p = 0.348
greenyellow     : coef = -0.105, p = 0.238
beige           : coef = -0.131, p = 0.140
saddlebrown     : coef = 0.133, p = 0.135
green           : coef = 0.152, p = 0.087
blue            : coef = 0.157, p = 0.077
yellow          : coef = -0.166, p = 0.061
red             : coef = -0.174, p = 0.049
white           : coef = -0.253, p = 0.004
black           : coef = -0.324, p = 0.000

Красный, белый и черный. Чем старше человек, тем меньше драматичности в его одежде! Но есть версия, что на результат могли повлить выбросы. Раз уж красный цвет буквально ходит по грани, попробуем от них избавиться и проделать все то же самое.

In [21]:
df_freqs_new = df_freqs[df_freqs['age'] < 50]
df_freqs_new.shape
Out[21]:
(126, 23)
In [22]:
for color in colors:
    plt.scatter(df_freqs_new['age'], df_freqs_new[color], color='darkblue', s=3)
    plt.xlabel(color)
    plt.show()
In [23]:
colors_diffs_in_age_new = []

for color in colors:
    coef, p = spearmanr(df_freqs_new['age'], df_freqs_new[color])
    colors_diffs_in_age_new.append((color, coef, p))
        
colors_diffs_in_age_new.sort(key=lambda x: -x[2])
        
for record in colors_diffs_in_age_new:
    print(f'{record[0]:<16}: coef = %.3f, p = %.3f' % (record[1], record[2]))
    
    
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter([record[2] for record in colors_diffs_in_age_new],
           [record[0] for record in colors_diffs_in_age_new],
           edgecolor='black',
           color=[x[0] for x in colors_diffs_in_age_new],
           marker='^',
           s=[100 if x[2] < alpha else 50 for x in colors_diffs_in_age_new])
ax.axvline(x=alpha)
ax.set_yticklabels([x[0] for x in colors_diffs_in_age_new],
                   fontsize = 15)
plt.xlabel('p')
plt.show()
mediumturquoise : coef = 0.023, p = 0.799
pink            : coef = -0.023, p = 0.797
coral           : coef = -0.032, p = 0.726
lightskyblue    : coef = 0.038, p = 0.674
purple          : coef = 0.039, p = 0.669
maroon          : coef = 0.042, p = 0.643
orange          : coef = -0.049, p = 0.586
orchid          : coef = 0.053, p = 0.559
gray            : coef = -0.061, p = 0.495
greenyellow     : coef = -0.098, p = 0.274
beige           : coef = -0.125, p = 0.162
saddlebrown     : coef = 0.136, p = 0.129
green           : coef = 0.137, p = 0.125
red             : coef = -0.150, p = 0.095
yellow          : coef = -0.166, p = 0.063
blue            : coef = 0.192, p = 0.032
white           : coef = -0.304, p = 0.001
black           : coef = -0.316, p = 0.000

Результаты и правда поменялись! Черный и белый все так же привлекают молодежь, а вот красный сменился синим, причем с другим знаком.

Теперь посмотрим на сочетания цветов.

In [24]:
import itertools
colors_together = []

results = []
for pair in itertools.combinations(colors, 2):
    coef, p = spearmanr(df_freqs[pair[0]], df_freqs[pair[1]])
    if p < alpha:
        colors_together.append((pair, coef, p))

colors_together.sort(key=lambda x: x[2])
        
for record in colors_together:
    comment_string = 'Носим вместе!\n' if record[1] > 0 else 'Лучше не сочетать!\n'
    explode = [0.1] * 2 if record[1] < 0 else None
    plt.pie([1, 1], colors=record[0], shadow=True, startangle=90, explode=explode)
    plt.xlabel(comment_string + 'coef = %.3f, p = %.3f' % (record[1], record[2]))
    plt.show()

Модные сочетания: носим оранжевый с салатовый, а фиолетовый с сиреневым. Избегаем синего с бежевым и белым! То, что черный отрицательно связан с синим и голубым, думаю, связано с тремя основными цветами джинсов.

Просто ради любопытства посмотрим для всех цветов, как меняется вероятность их встретить при условии, что какой-то другой цвет мы уже надели

In [25]:
results = []
for pair in itertools.permutations(colors, 2):
    p_color1 = df[pair[1]].sum() / df.shape[0]
    df_color0 = df[df[pair[0]] == 1]
    p_color1_cond = df_color0[pair[1]].sum() / df_color0.shape[0]
    if abs(p_color1 - p_color1_cond) >= 0.15:
        print(f'{pair[1]}: {round(p_color1, 2)}, with {pair[0]}: {round(p_color1_cond, 2)}')
white: 0.34, with orange: 0.18
blue: 0.44, with pink: 0.28
blue: 0.44, with beige: 0.26
gray: 0.31, with orchid: 0.11
black: 0.52, with mediumturquoise: 0.3
black: 0.52, with coral: 0.24
blue: 0.44, with greenyellow: 0.06
pink: 0.13, with greenyellow: 0.32

Заметим, что второй (фоновый) цвет при резком изменении вероятности скорее из редких, а первый, для которого вероятность считаем - из "базовых". Многое подтверждается тестами выше.

К сожалению, для поиска интересного по городам и роду деятельности пока слишком мало представлено разных категорий, потому что мало пользователей, а для адекватного отражения сезонности бот существует слишком мало времени. Попробуем посмотреть на цвета + температуру воздуха в СПб. Температуру возьмем с https://www.gismeteo.ru/diary/

In [26]:
degrees = [5, 6, 4, 4, 7, 4, 6, 3, 7, 6, 11, 19, 11, 7, 9, 12, 15, 15, 12, 12, 4, 13, 8, 6, 5, 5, 4, 5, 7, 12, 9, 7, 5, 3, 8,
           7, 8, 3, 11, 15, 25, 27, 24, 20, 22, 19]

df_spb = df[df['city'] == 'spb']
df_spb.shape
Out[26]:
(1912, 30)
In [27]:
df_spb['degrees'] = [degrees[x - 1] for x in df_spb['day_number']]
df_spb
C:\Users\Pusheen\anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
Out[27]:
user_id sex timezone age city occupation question_time_minutes white gray black ... maroon mediumturquoise coral greenyellow year month day day_number weekday degrees
0 93 female 3 28.0 spb worker 1320.0 0 0 1 ... 0 0 0 0 2021 4 13 12 1 19
1 93 female 3 28.0 spb worker 1320.0 1 0 1 ... 0 0 0 0 2021 4 14 13 2 11
2 93 female 3 28.0 spb worker 1320.0 0 1 1 ... 0 0 0 0 2021 4 15 14 3 7
3 93 female 3 28.0 spb worker 1320.0 0 0 0 ... 0 0 0 0 2021 4 16 15 4 9
4 93 female 3 28.0 spb worker 1320.0 0 0 1 ... 0 0 0 0 2021 4 17 16 5 12
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2343 57 female 3 30.0 spb worker 1240.0 0 0 1 ... 0 0 0 0 2021 5 16 45 6 22
2344 57 female 3 30.0 spb worker 1240.0 0 0 0 ... 0 0 0 0 2021 5 17 46 0 19
2345 96 female 3 32.0 spb worker 780.0 0 0 0 ... 1 0 0 0 2021 5 14 43 4 24
2346 96 female 3 32.0 spb worker 780.0 0 0 0 ... 1 0 0 0 2021 5 15 44 5 20
2347 96 female 3 32.0 spb worker 780.0 1 0 1 ... 0 0 0 0 2021 5 16 45 6 22

1912 rows × 31 columns

In [28]:
colors_that_diff_when_warm = []

for color in colors:
    coef, p = spearmanr(df_spb['degrees'], df_spb[color])
    colors_that_diff_when_warm.append((color, coef, p))
        
colors_that_diff_when_warm.sort(key=lambda x: -x[2])
        
for record in colors_that_diff_when_warm:
    print(f'{record[0]:<8}: coef = %.3f, p = %.3f' % (record[1], record[2]))

      
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter([record[2] for record in colors_that_diff_when_warm],
           [record[0] for record in colors_that_diff_when_warm],
           edgecolor='black',
           color=[x[0] for x in colors_that_diff_when_warm],
           marker='^',
           s=[100 if x[2] < alpha else 50 for x in colors_that_diff_when_warm])
ax.axvline(x=alpha)
ax.set_yticklabels([x[0] for x in colors_that_diff_when_warm],
                   fontsize = 15)
plt.xlabel('p')
plt.show()
pink    : coef = -0.002, p = 0.935
orchid  : coef = 0.004, p = 0.848
maroon  : coef = -0.006, p = 0.798
gray    : coef = 0.007, p = 0.763
purple  : coef = 0.008, p = 0.731
yellow  : coef = 0.008, p = 0.717
beige   : coef = 0.011, p = 0.628
mediumturquoise: coef = 0.013, p = 0.579
orange  : coef = -0.017, p = 0.469
blue    : coef = -0.020, p = 0.373
green   : coef = 0.030, p = 0.189
red     : coef = -0.033, p = 0.152
greenyellow: coef = 0.034, p = 0.137
coral   : coef = 0.045, p = 0.051
lightskyblue: coef = 0.048, p = 0.036
saddlebrown: coef = -0.063, p = 0.006
white   : coef = 0.072, p = 0.002
black   : coef = -0.077, p = 0.001

Что ж, выглядит правдоподобно, хотя корреляции и малы. Черный и коричневый носим меньше, белый и голубой - больше!

Наконец, посмотрим на выходные и будние дни. В этом году все было сложно с 4-7 мая, поэтому мы их выкинем :)

In [29]:
df_weekends = df[df['weekday'] >= 5]
df_3_10 = df[df['day'] == 3]
df_3_10 = df_3_10[df_3_10['month'] == 5]
df_3_10

df_weekends = pd.concat([df_weekends, df_3_10])
df_weekends
Out[29]:
user_id sex timezone age city occupation question_time_minutes white gray black ... orchid maroon mediumturquoise coral greenyellow year month day day_number weekday
4 93 female 3 28.0 spb worker 1320.0 0 0 1 ... 0 0 0 0 0 2021 4 17 16 5
5 93 female 3 28.0 spb worker 1320.0 1 0 1 ... 0 0 0 0 0 2021 4 18 17 6
11 93 female 3 28.0 spb worker 1320.0 1 0 1 ... 0 0 0 0 0 2021 4 24 23 5
12 93 female 3 28.0 spb worker 1320.0 0 1 1 ... 0 0 0 0 0 2021 4 25 24 6
18 93 female 3 28.0 spb worker 1320.0 1 0 1 ... 0 0 0 0 0 2021 5 1 30 5
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2183 68 female 2 35.0 other worker 735.0 0 1 0 ... 0 0 0 0 0 2021 5 3 32 0
2201 63 female 3 21.0 spb worker 960.0 0 0 1 ... 0 1 0 0 0 2021 5 3 32 0
2218 55 female 3 24.0 spb worker 1200.0 0 1 0 ... 0 0 0 0 0 2021 5 3 32 0
2234 33 female 2 27.0 other worker 780.0 1 0 1 ... 1 0 0 0 0 2021 5 3 32 0
2236 0 female 3 28.0 spb worker 1260.0 0 1 1 ... 0 0 0 0 0 2021 5 3 32 0

746 rows × 30 columns

In [30]:
df_workdays = df[df['weekday'] < 5]
df_workdays = df_workdays[df_workdays['month'] == 4]

df_march = df[df['month'] == 5]
df_march = df_march[df_march['day'] > 10]
df_march = df_march[df_march['weekday'] < 5]

df_workdays = pd.concat([df_workdays, df_march])
df_workdays
Out[30]:
user_id sex timezone age city occupation question_time_minutes white gray black ... orchid maroon mediumturquoise coral greenyellow year month day day_number weekday
0 93 female 3 28.0 spb worker 1320.0 0 0 1 ... 0 0 0 0 0 2021 4 13 12 1
1 93 female 3 28.0 spb worker 1320.0 1 0 1 ... 0 0 0 0 0 2021 4 14 13 2
2 93 female 3 28.0 spb worker 1320.0 0 1 1 ... 0 0 0 0 0 2021 4 15 14 3
3 93 female 3 28.0 spb worker 1320.0 0 0 0 ... 1 0 0 0 0 2021 4 16 15 4
6 93 female 3 28.0 spb worker 1320.0 1 0 1 ... 0 0 0 0 0 2021 4 19 18 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2337 26 male 3 34.0 spb worker 1005.0 0 1 0 ... 0 0 0 0 0 2021 5 14 43 4
2340 57 female 3 30.0 spb worker 1240.0 0 0 0 ... 0 0 0 0 0 2021 5 13 42 3
2341 57 female 3 30.0 spb worker 1240.0 0 0 0 ... 0 0 0 0 0 2021 5 14 43 4
2344 57 female 3 30.0 spb worker 1240.0 0 0 0 ... 0 0 0 0 0 2021 5 17 46 0
2345 96 female 3 32.0 spb worker 780.0 0 0 0 ... 0 1 0 0 0 2021 5 14 43 4

1323 rows × 30 columns

In [31]:
df_freqs_weekends = df_weekends.groupby('day_number').sum()[colors] / df_weekends.groupby('day_number').count()[colors]
df_freqs_weekends.shape
Out[31]:
(14, 18)
In [32]:
df_freqs_workdays = df_workdays.groupby('day_number').sum()[colors] / df_workdays.groupby('day_number').count()[colors]
df_freqs_workdays.shape
Out[32]:
(25, 18)
In [33]:
color_diffs = []
alpha = 0.05

for color in colors:
    stat, p = mannwhitneyu(df_freqs_weekends[color], df_freqs_workdays[color])
    color_diffs.append((color, stat, p))

color_diffs.sort(key=lambda x: -x[2])


for record in color_diffs:
    print(f'{record[0]:<16}: stat = %.3f, p = %.3f' % (record[1], record[2]))
    
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter([record[2] for record in color_diffs],
           [record[0] for record in color_diffs],
           edgecolor='black',
           color=[x[0] for x in color_diffs],
           marker='^',
           s=[100 if x[2] < alpha else 50 for x in color_diffs])
ax.axvline(x=alpha)
ax.set_yticklabels([x[0] for x in color_diffs],
                   fontsize = 15)
plt.xlabel('p')
plt.show()
green           : stat = 159.000, p = 0.325
orange          : stat = 156.000, p = 0.294
maroon          : stat = 150.500, p = 0.240
greenyellow     : stat = 151.000, p = 0.227
pink            : stat = 147.000, p = 0.210
white           : stat = 146.500, p = 0.206
blue            : stat = 146.000, p = 0.202
red             : stat = 145.500, p = 0.198
mediumturquoise : stat = 144.500, p = 0.185
yellow          : stat = 143.000, p = 0.178
gray            : stat = 142.000, p = 0.171
coral           : stat = 135.000, p = 0.114
lightskyblue    : stat = 129.000, p = 0.091
purple          : stat = 121.000, p = 0.059
saddlebrown     : stat = 118.500, p = 0.051
orchid          : stat = 104.500, p = 0.019
black           : stat = 102.000, p = 0.017
beige           : stat = 82.000, p = 0.003

Что бы это могло значить? Непонятно. Будем вести наблюдения дальше!