import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from datetime import datetime
from scipy.stats import spearmanr
from sklearn.preprocessing import LabelEncoder
# Load the raw records from a CSV file given by a relative path.
df = pd.read_csv('results.csv')
# Replace the original user identifiers with integer codes 0..n_users-1.
le = LabelEncoder()
df['user_id'] = le.fit_transform(df['user_id'])
# Inspection left over from the notebook: df.info() prints a summary, while the
# bare expressions only render output in an interactive session.
df
df.info()
df['city'].unique()
def get_occupation(x):
    """Encode an occupation label as an integer code.

    'kid' -> 0, 'student' -> 1, 'worker' -> 2, 'retiree' -> 3.
    Any other value is encoded as 4.
    """
    known_labels = ('kid', 'student', 'worker', 'retiree')
    for code, label in enumerate(known_labels):
        if x == label:
            return code
    # Everything that is not a known label shares the catch-all code.
    return len(known_labels)
def get_city(city):
    """Map a free-form city name to a short canonical code.

    Args:
        city: Raw value of the city field. Anything that is not a string
            (for example a missing value) is treated as unknown.

    Returns:
        'spb', 'msk', 'ekb', 'nsk' or 'sev' for a recognised city,
        'other' for any other string, and 'unknown' for non-string input.
        Matching ignores letter case.
    """
    if not isinstance(city, str):
        return 'unknown'

    # Aliases are stored lower-cased; the input is lower-cased before lookup.
    aliases_by_code = {
        'spb': {
            'петербург',
            'питер',
            'с-пб',
            'спб',
            'санкт-петербург',
            'санкт-петербурга',
        },
        # 'moskow' is kept because the original code accepted that spelling.
        'msk': {'москва', 'moskow', 'moscow'},
        'ekb': {'екатеринбург'},
        'nsk': {'новосибирск'},
        'sev': {'севастополь'},
    }

    normalized = city.lower()
    for code, aliases in aliases_by_code.items():
        if normalized in aliases:
            return code
    return 'other'
def get_weekday(year, month, day):
    """Return the day of the week for a calendar date (Monday is 0, Sunday is 6)."""
    moment = datetime(year, month, day)
    return moment.weekday()
def get_day_number(year, month, day):
    """Return the number of days between 2021-04-01 and the given date.

    2021-04-01 itself yields 0; earlier dates yield negative numbers.
    """
    reference = datetime(2021, 4, 1)
    return (datetime(year, month, day) - reference).days
# Drop the index column that was saved into the CSV.
df = df.drop('Unnamed: 0', axis=1)
# Build calendar features from the date, which is stored as a string:
# characters 0-3 are read as the year, 5-6 as the month and 8-9 as the day.
# Each string is parsed once and the parts are reused for all derived columns.
date_parts = [(int(x[:4]), int(x[5:7]), int(x[8:10])) for x in df['date']]
df['year'] = [year for year, _, _ in date_parts]
df['month'] = [month for _, month, _ in date_parts]
df['day'] = [day for _, _, day in date_parts]
df['day_number'] = [get_day_number(*parts) for parts in date_parts]
df['weekday'] = [get_weekday(*parts) for parts in date_parts]
df = df.drop('date', axis=1)
# Treat an unknown time zone as +3.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# a column selection: that chained form is deprecated in pandas 2.x and does
# not modify df when copy-on-write is enabled.
df['timezone'] = df['timezone'].fillna(3)
# Fill in the unknown occupation
df['occupation'] = df['occupation'].fillna('unknown')
# and the unknown sex.
df['sex'] = df['sex'].fillna('unknown')
# Standardise the city names.
df['city'] = [get_city(x) for x in df['city']]
# Fill the remaining gaps with the column medians. numeric_only=True is needed
# because the frame still holds string columns, for which a median is undefined
# (pandas >= 2.0 raises instead of silently skipping them).
df.fillna(df.median(numeric_only=True), inplace=True)
# Check that everything has been filled in.
df.info()
df
# Columns that describe a person rather than a single record.
person_features = ['user_id', 'sex', 'timezone', 'age', 'city', 'occupation']
# Names of the per-color columns of df; the same strings are passed to
# matplotlib as plot colors further down.
colors = ['white', 'gray', 'black', 'red', 'blue', 'yellow', 'green', 'orange', 'purple', 'pink', 'lightskyblue', 'beige',
          'saddlebrown', 'orchid', 'maroon', 'mediumturquoise', 'coral', 'greenyellow']


def _first_mode(values):
    """Return one most-frequent value of a Series (the first that Series.mode reports)."""
    modes = values.mode()
    # Series.mode returns every tied value; keeping only the first guarantees a
    # scalar per cell. An empty result (all values missing) becomes NaN.
    return modes.iloc[0] if len(modes) else np.nan


# One row per user holding the most frequent value of each personal feature.
# Aggregating with pd.Series.mode directly would put an array into the cell
# whenever a user has several equally frequent values.
df_persons = df[person_features].groupby('user_id').agg(_first_mode)
df_persons
# Histograms of every feature: first over persons, then over individual records.
# Each spec is (data frame, features to plot, bar color, y-axis label).
hist_specs = [
    (df_persons,
     ['sex', 'timezone', 'age', 'city', 'occupation'],
     'springgreen',
     'number of persons'),
    (df,
     ['sex', 'timezone', 'age', 'city', 'occupation', 'month', 'day', 'day_number', 'weekday'],
     'green',
     'number of records'),
]
for frame, features, fill_color, y_label in hist_specs:
    for feature in features:
        values = frame[feature]
        # One bin per distinct value, capped at 10 bins.
        n_bins = min(len(values.unique()), 10)
        plt.hist(values, bins=n_bins, edgecolor='black', alpha=0.7, color=fill_color)
        plt.xlabel(feature)
        plt.ylabel(y_label)
        plt.show()
# Stacked histogram of the records per day, split by the color worn.
# One bin per distinct day number.
bins = len(df['day_number'].unique())
fig, ax = plt.subplots(figsize=(15, 10))
# A tinted background keeps the white bars visible.
ax.set_facecolor('linen')
# For every color, the day numbers of the records where that color flag is 1.
day_numbers_by_color = [df.loc[df[color] == 1, 'day_number'] for color in colors]
ax.hist(day_numbers_by_color,
        bins=bins,
        stacked=True,
        color=colors,
        edgecolor='black')
ax.set_xlabel('day number')
ax.set_ylabel('colors')
plt.show()
# Per-user mean of every color column, i.e. the share of the user's records in
# which the color flag is set (assumes the color columns hold 0/1 flags — TODO confirm).
# Selecting the color columns before aggregating avoids summing and counting
# every other column (including the string ones) only to throw the result away.
df_freqs = df.groupby('user_id')[colors].mean()
# Attach the personal features of each user.
df_freqs = df_freqs.merge(df_persons, on=['user_id'])
df_freqs
# For every color, count the users whose frequency of that color is exactly 0,
# then order the colors from the fewest such users to the most.
color_lovers = [(np.sum(df_freqs[color] == 0), color) for color in colors]
color_lovers.sort(key=lambda x: x[0])
non_wearers = [x[0] for x in color_lovers]
color_names = [x[1] for x in color_lovers]
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(non_wearers, color_names, color=color_names, edgecolor='black', s=60)
# The categorical y-axis already labels its ticks with the color names in the
# plotted order, so only the font size is changed. Calling set_yticklabels
# without fixing the tick positions first makes matplotlib emit a warning.
ax.tick_params(axis='y', labelsize=15)
plt.xlabel("Number of people who don't wear this color")
plt.show()
print('Минимальное количество не носивших:', color_lovers[0][0], color_lovers[0][1])
print('Мaксимальное количество не носивших:', color_lovers[-1][0], color_lovers[-1][1])
# Split the per-user frequencies by sex; the constant 'sex' column is dropped
# from each part. Users with any other 'sex' value end up in neither frame.
sex_column = df_freqs['sex']
df_males = df_freqs.loc[sex_column == 'male'].drop(columns='sex')
df_females = df_freqs.loc[sex_column == 'female'].drop(columns='sex')
df_males.shape
df_females.shape
from scipy.stats import mannwhitneyu
# Compare the female and male frequency distributions of every color with a
# Mann-Whitney U test and draw the two normalised histograms on top of each other.
# NOTE(review): no multiple-comparison correction is applied across the colors.
color_diffs = []
# Significance level used by this and the following tests.
alpha = 0.05
for color in colors:
    plt.hist(df_females[color], alpha=0.7, color='pink', bins=10, density=True)
    plt.hist(df_males[color], alpha=0.4, color='blue', bins=10, density=True)
    # The alternative is spelled out because the default differs between SciPy
    # versions (older releases did not default to a two-sided test).
    stat, p = mannwhitneyu(df_females[color], df_males[color], alternative='two-sided')
    color_diffs.append((color, stat, p))
    plt.xlabel(f'{color}, stat={stat:.3f}, p={p:.3f}')
    plt.show()
# Order the colors from the largest p-value to the smallest.
color_diffs.sort(key=lambda x: -x[2])
for record in color_diffs:
    print(f'{record[0]:<16}: stat = {record[1]:.3f}, p = {record[2]:.3f}')

# Plot the p-value of every color; larger markers are results below alpha and
# the vertical line marks alpha itself.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter([record[2] for record in color_diffs],
           [record[0] for record in color_diffs],
           edgecolor='black',
           color=[x[0] for x in color_diffs],
           marker='^',
           s=[100 if x[2] < alpha else 50 for x in color_diffs])
ax.axvline(x=alpha)
# The categorical y-axis already shows the color names in plotted order; only
# the font size is adjusted (set_yticklabels without fixed ticks triggers a
# matplotlib warning).
ax.tick_params(axis='y', labelsize=15)
plt.xlabel('p')
plt.show()
# Scatter plot of age against the frequency of each color, one figure per color.
ages = df_freqs['age']
for color in colors:
    color_frequency = df_freqs[color]
    plt.scatter(ages, color_frequency, color='darkblue', s=3)
    plt.xlabel(color)
    plt.ylabel('frequency of color')
    plt.show()
# Spearman rank correlation between age and the frequency of every color.
colors_diffs_in_age = []
for color in colors:
    coef, p = spearmanr(df_freqs['age'], df_freqs[color])
    colors_diffs_in_age.append((color, coef, p))

# Order the colors from the largest p-value to the smallest.
colors_diffs_in_age.sort(key=lambda x: -x[2])

for record in colors_diffs_in_age:
    print(f'{record[0]:<16}: coef = {record[1]:.3f}, p = {record[2]:.3f}')

# Plot the p-value of every color; larger markers are results below alpha and
# the vertical line marks alpha itself.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter([record[2] for record in colors_diffs_in_age],
           [record[0] for record in colors_diffs_in_age],
           edgecolor='black',
           color=[x[0] for x in colors_diffs_in_age],
           marker='^',
           s=[100 if x[2] < alpha else 50 for x in colors_diffs_in_age])
ax.axvline(x=alpha)
# The categorical y-axis already shows the color names in plotted order; only
# the font size is adjusted (set_yticklabels without fixed ticks triggers a
# matplotlib warning).
ax.tick_params(axis='y', labelsize=15)
plt.xlabel('p')
plt.show()
# Keep only the users younger than 50 and repeat the age scatter plots on them.
under_50 = df_freqs['age'] < 50
df_freqs_new = df_freqs.loc[under_50]
df_freqs_new.shape
for color in colors:
    plt.scatter(df_freqs_new['age'], df_freqs_new[color], s=3, color='darkblue')
    plt.xlabel(color)
    plt.show()
# Spearman rank correlation between age and the frequency of every color,
# computed on the users younger than 50 only.
colors_diffs_in_age_new = []
for color in colors:
    coef, p = spearmanr(df_freqs_new['age'], df_freqs_new[color])
    colors_diffs_in_age_new.append((color, coef, p))

# Order the colors from the largest p-value to the smallest.
colors_diffs_in_age_new.sort(key=lambda x: -x[2])

for record in colors_diffs_in_age_new:
    print(f'{record[0]:<16}: coef = {record[1]:.3f}, p = {record[2]:.3f}')

# Plot the p-value of every color; larger markers are results below alpha and
# the vertical line marks alpha itself.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter([record[2] for record in colors_diffs_in_age_new],
           [record[0] for record in colors_diffs_in_age_new],
           edgecolor='black',
           color=[x[0] for x in colors_diffs_in_age_new],
           marker='^',
           s=[100 if x[2] < alpha else 50 for x in colors_diffs_in_age_new])
ax.axvline(x=alpha)
# The categorical y-axis already shows the color names in plotted order; only
# the font size is adjusted (set_yticklabels without fixed ticks triggers a
# matplotlib warning).
ax.tick_params(axis='y', labelsize=15)
plt.xlabel('p')
plt.show()
import itertools
# Spearman correlation between the per-user frequencies of every pair of
# colors; only the pairs with p below alpha are kept.
colors_together = []
results = []
for pair in itertools.combinations(colors, 2):
    first_color, second_color = pair
    coef, p = spearmanr(df_freqs[first_color], df_freqs[second_color])
    # Written as "not p < alpha" so that a NaN p-value is skipped as well.
    if not p < alpha:
        continue
    colors_together.append((pair, coef, p))
# Smallest p-value first.
colors_together.sort(key=lambda x: x[2])

# One two-slice pie per kept pair, painted in the two colors.
for record in colors_together:
    pair_colors, pair_coef, pair_p = record
    if pair_coef > 0:
        comment_string = 'Носим вместе!\n'
    else:
        comment_string = 'Лучше не сочетать!\n'
    # The slices are pulled apart only for a negative correlation.
    if pair_coef < 0:
        explode = [0.1] * 2
    else:
        explode = None
    plt.pie([1, 1], colors=pair_colors, shadow=True, startangle=90, explode=explode)
    plt.xlabel(comment_string + 'coef = %.3f, p = %.3f' % (pair_coef, pair_p))
    plt.show()
# For every ordered pair of colors, compare the overall share of records with
# the second color against its share among the records that have the first
# color, and report the pairs where the two differ by at least 0.15.
results = []
total_records = df.shape[0]
for pair in itertools.permutations(colors, 2):
    given_color, target_color = pair
    p_color1 = df[target_color].sum() / total_records
    df_color0 = df[df[given_color] == 1]
    p_color1_cond = df_color0[target_color].sum() / df_color0.shape[0]
    shift = abs(p_color1 - p_color1_cond)
    if shift >= 0.15:
        print(f'{target_color}: {round(p_color1, 2)}, with {given_color}: {round(p_color1_cond, 2)}')
# One value per day, looked up by day_number below; presumably daily
# temperatures for St. Petersburg — TODO confirm the source and units.
degrees = [5, 6, 4, 4, 7, 4, 6, 3, 7, 6, 11, 19, 11, 7, 9, 12, 15, 15, 12, 12, 4, 13, 8, 6, 5, 5, 4, 5, 7, 12, 9, 7, 5, 3, 8,
           7, 8, 3, 11, 15, 25, 27, 24, 20, 22, 19]
# Records whose standardised city is 'spb'. The explicit copy makes df_spb an
# independent frame, so adding a column below neither raises
# SettingWithCopyWarning nor depends on whether the filter returned a view.
df_spb = df[df['city'] == 'spb'].copy()
df_spb.shape
# NOTE(review): day_number 0 (2021-04-01) maps to degrees[-1], i.e. silently
# wraps to the last entry, and a day_number above len(degrees) raises
# IndexError — confirm that the data only covers day numbers 1..len(degrees).
df_spb['degrees'] = [degrees[x - 1] for x in df_spb['day_number']]
df_spb
# Spearman rank correlation between the 'degrees' value of a record and every
# color flag, over the 'spb' records.
colors_that_diff_when_warm = []
for color in colors:
    coef, p = spearmanr(df_spb['degrees'], df_spb[color])
    colors_that_diff_when_warm.append((color, coef, p))

# Order the colors from the largest p-value to the smallest.
colors_that_diff_when_warm.sort(key=lambda x: -x[2])

for record in colors_that_diff_when_warm:
    print(f'{record[0]:<8}: coef = {record[1]:.3f}, p = {record[2]:.3f}')

# Plot the p-value of every color; larger markers are results below alpha and
# the vertical line marks alpha itself.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter([record[2] for record in colors_that_diff_when_warm],
           [record[0] for record in colors_that_diff_when_warm],
           edgecolor='black',
           color=[x[0] for x in colors_that_diff_when_warm],
           marker='^',
           s=[100 if x[2] < alpha else 50 for x in colors_that_diff_when_warm])
ax.axvline(x=alpha)
# The categorical y-axis already shows the color names in plotted order; only
# the font size is adjusted (set_yticklabels without fixed ticks triggers a
# matplotlib warning).
ax.tick_params(axis='y', labelsize=15)
plt.xlabel('p')
plt.show()
# Masks shared by the two selections below.
is_may = df['month'] == 5
is_weekend = df['weekday'] >= 5
is_weekday = df['weekday'] < 5

# Days off: Saturdays and Sundays plus the records dated 3 May.
df_3_10 = df[(df['day'] == 3) & is_may]
df_3_10
df_weekends = pd.concat([df[is_weekend], df_3_10])
df_weekends

# Working days: April weekdays plus the May weekdays after the 10th.
# Despite its name, df_march holds May records (month == 5).
df_march = df[is_may & (df['day'] > 10) & is_weekday]
df_workdays = pd.concat([df[is_weekday & (df['month'] == 4)], df_march])
df_workdays
# Per-day mean of every color column, separately for days off and working
# days. Selecting the color columns before aggregating avoids summing and
# counting every other column (including the string ones) only to discard them.
df_freqs_weekends = df_weekends.groupby('day_number')[colors].mean()
df_freqs_weekends.shape
df_freqs_workdays = df_workdays.groupby('day_number')[colors].mean()
df_freqs_workdays.shape
# Compare the daily color frequencies of days off and working days with a
# Mann-Whitney U test, one test per color.
# NOTE(review): no multiple-comparison correction is applied across the colors.
color_diffs = []
alpha = 0.05
for color in colors:
    # The alternative is spelled out because the default differs between SciPy
    # versions (older releases did not default to a two-sided test).
    stat, p = mannwhitneyu(df_freqs_weekends[color], df_freqs_workdays[color], alternative='two-sided')
    color_diffs.append((color, stat, p))
# Order the colors from the largest p-value to the smallest.
color_diffs.sort(key=lambda x: -x[2])
for record in color_diffs:
    print(f'{record[0]:<16}: stat = {record[1]:.3f}, p = {record[2]:.3f}')

# Plot the p-value of every color; larger markers are results below alpha and
# the vertical line marks alpha itself.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter([record[2] for record in color_diffs],
           [record[0] for record in color_diffs],
           edgecolor='black',
           color=[x[0] for x in color_diffs],
           marker='^',
           s=[100 if x[2] < alpha else 50 for x in color_diffs])
ax.axvline(x=alpha)
# The categorical y-axis already shows the color names in plotted order; only
# the font size is adjusted (set_yticklabels without fixed ticks triggers a
# matplotlib warning).
ax.tick_params(axis='y', labelsize=15)
plt.xlabel('p')
plt.show()