In this presentation, we investigate 110,000 medical appointments in Brazil and their 14 associated variables,
pointing out any correlations with patients not showing up for their appointments.
We also answer some questions about this data set.
After cleaning, however, the data was reduced to around 67,000 records.
# Count plot of the `no_show` column, bars ordered by descending frequency.
counts, idx, txt = bar_data(med, 'no_show')
fig = plt.figure(figsize=(7, 5))
sns.countplot(
    data=med,
    x='no_show',
    color=base_color,
    order=med.no_show.value_counts().index,
)
# Write the matching entry of `txt` on top of every bar; the bar height is
# looked up in `counts` through the key stored in `idx`.
for bar_pos in range(len(counts)):
    bar_height = counts[idx[bar_pos]]
    plt.text(bar_pos, bar_height, txt[bar_pos], ha='center', va='bottom', size=15)
plt.xticks([0, 1], ['Show', 'No Show'])
plt.title("20% of the appointments, patients didn't show up", fontsize=15, y=1.05)
plt.xlabel('')
sns.despine()
# Count plot of `age_group`; the group(s) with the largest count use the base
# colour, every other bar is drawn in grey.
counts, idx, txt = bar_data(med, 'age_group')
colors = []
for _, value in counts.items():
    if value == counts.max():
        colors.append(base_color)
    else:
        colors.append('#A9A9A9')
plt.figure(figsize=(12, 7))
sns.countplot(data=med, x='age_group', palette=colors)
# Annotate each bar with its entry from `txt`, anchored at the bar top.
for bar_pos in range(len(counts)):
    bar_height = counts[idx[bar_pos]]
    plt.text(bar_pos, bar_height, txt[bar_pos], ha='center', va='bottom', size=13)
plt.xlabel('Age group', fontsize=13)
plt.ylabel('Count', fontsize=13)
plt.title('Young Adults represents the majority of patients', fontsize=15, y=1.05)
sns.despine()
# Line plot of `no_show_bin` against `age`, one coloured segment per age group.
fig = plt.figure(figsize=(12, 7))
ax = fig.add_subplot(1, 1, 1)
ax.grid(zorder=0)
ax.xaxis.grid(color='gray', linestyle='-')
g = sns.lineplot(
    data=med,
    x='age',
    y='no_show_bin',
    lw=4,
    zorder=3,
    hue='age_group',
    hue_order=[
        'Baby 0-2',
        'Teen 3-15',
        'Young Adults 16-39',
        'Middle-aged 40-59',
        'Elderly 60+',
    ],
    palette=sns.cubehelix_palette(5, start=2.5, rot=0.2, hue=1),
)
plt.xlim(0, 80)
plt.title('Older patients are less likely not to show up than younger ones', size=15, y=1.05)
plt.ylabel(
    'Probability\nOf not attending\nAn Appointment',
    size=13,
    rotation=0,
    va='top',
    ha='left',
    labelpad=90,
)
# Re-label the current y tick positions as whole percentages. This runs before
# the y limits are changed, so the tick positions are the ones chosen for the
# automatically scaled axis.
y_tick_positions = plt.yticks()[0]
plt.yticks(y_tick_positions, [f"{yy:.0%}" for yy in y_tick_positions])
plt.xlabel('Age', size=13)
plt.ylim(0, 0.4)
plt.legend(title=False, frameon=False, prop={'size': 12})
sns.despine()
# Count plot of the scheduling hour (`sc_hr`); the bars for hours 7 and 13 are
# highlighted with the base colour, the rest are grey.
counts, idx, txt = bar_data(med, 'sc_hr')
fig = plt.figure(figsize=(12, 7))
colors = []
for hour, _ in counts.items():
    colors.append(base_color if hour in [7, 13] else '#A9A9A9')
g = sns.countplot(data=med, x='sc_hr', palette=colors)
# Put the matching `txt` entry on top of every bar.
for bar_pos in range(len(counts)):
    bar_height = counts[idx[bar_pos]]
    plt.text(bar_pos, bar_height, txt[bar_pos], ha='center', va='bottom', size=13)
plt.xlabel('Schedueling Hour', size=13)
plt.ylabel('Frequency', size=13)
plt.title("Peaks observed at 7AM and 1PM when patients book appointments", size=15)
sns.despine()
Scheduling peaks at 7 AM and gradually decreases until 12 PM, then peaks again at 1 PM and 2 PM and gradually decreases again until 10 PM.
def minu(t):
    """Format a decimal hour as a 12-hour clock label.

    Used to turn numeric axis ticks (e.g. ``7.5``) into strings such as
    ``'7:30 AM'``.

    Args:
        t: Hour of the day as a number (int, float or NumPy scalar); the
            fractional part is interpreted as a fraction of an hour.

    Returns:
        A string of the form ``'H:MM AM'`` or ``'H:MM PM'``.
    """
    # Work in whole minutes so that integer input (which has no '.' in its
    # string form) and fractions other than .5 are handled uniformly.
    total_minutes = int(round(float(t) * 60))
    hours, minutes = divmod(total_minutes, 60)
    # Wrap onto a single day so out-of-range ticks still yield a valid time.
    hours %= 24
    suffix = "AM" if hours < 12 else "PM"
    # 12-hour clock: 0 and 12 are both shown as 12, 13 becomes 1, and so on.
    display_hour = hours % 12 or 12
    return f"{display_hour}:{minutes:02d} {suffix}"
# Two-dimensional density of scheduling hour (y) against patient age (x).
fig = plt.figure(figsize=(12, 7))
sns.kdeplot(
    data=med,
    y="sc_hr",
    x="age",
    fill=True,
    thresh=0,
    levels=100,
    cmap="mako",
)
plt.xlabel('Age', size=13)
age_ticks = np.arange(0, 102 + 8, 8)
plt.xticks(age_ticks, age_ticks)
plt.title(
    'Around 7:00 AM, scheduled appointments for patients younger than 8 years old,\nbetween 50 and 56 years old are more frequent',
    y=1.05,
    size=15,
)
plt.ylabel('Schedule \nHour', size=13, rotation=0, va='center', labelpad=30)
# Convert the current numeric y ticks into clock labels via `minu`; done before
# the axis limits are narrowed below.
hour_ticks = plt.yticks()[0]
plt.yticks(hour_ticks, [minu(yy) for yy in hour_ticks])
plt.ylim(5, 20)
plt.xlim(-5, 100)
Concentrations are observed around the 7 AM scheduling time. They are associated with patients younger than 8 years old and patients between 50 and 56 years old.
# Count plot of `avg_re_scds_per_patient` on a log-scaled y axis. Every bar
# except the first entry of `idx` is highlighted with the base colour.
counts, idx, txt = bar_data(med, 'avg_re_scds_per_patient')
plt.figure(figsize=(13, 7))
colors = []
for index in idx:
    colors.append(base_color if index in idx[1:] else '#A9A9A9')
g = sns.countplot(data=med, x='avg_re_scds_per_patient', palette=colors)
# Label each bar with its `txt` entry at the bar top.
for bar_pos in range(len(counts)):
    bar_height = counts[idx[bar_pos]]
    plt.text(bar_pos, bar_height, txt[bar_pos], ha='center', va='bottom', size=9)
plt.yscale('log')
plt.yticks(
    [10, 30, 100, 300, 1000, 3000, 10000, 30000],
    ['10', '30', '100', '300', '1k', '3k', '10k', '30k'],
)
reschedule_ticks = np.arange(0, 6 + 1, 1)
plt.xticks(reschedule_ticks, reschedule_ticks)
plt.xlabel("Number of average reschedules per day", fontsize=13)
plt.ylabel('Frequency (Log Scale)', fontsize=13)
plt.title(
    "There is a 16% chance that a patient reschedules an appointment at least 1 time per day (Log Scale)",
    fontsize=15,
)
sns.despine()
Around 84% of appointments are done using the 1st scheduling engagement!
# Point plot of the mean `no_show_bin` for each value of
# `avg_re_scds_per_patient` (0 through 5), drawn without confidence intervals.
fig = plt.figure(figsize=(12, 7))
sns.pointplot(
    data=med,
    x='avg_re_scds_per_patient',
    y='no_show_bin',
    ci=False,
    order=[0, 1, 2, 3, 4, 5],
)
plt.xlabel("Average reschedules per day", fontsize=13)
plt.ylabel(
    'Probability\nof not attending\nan appointment',
    size=13,
    rotation=0,
    ha='left',
    labelpad=106,
)
plt.title(
    "The more times an appointment is reschedueled per day, the less likely to be attended",
    size=15,
    y=1.05,
)
# Show the existing y tick positions as whole percentages.
y_tick_positions = plt.yticks()[0]
plt.yticks(y_tick_positions, [f"{yy:.0%}" for yy in y_tick_positions])
reschedule_ticks = np.arange(0, 6 + 1, 1)
plt.xticks(reschedule_ticks, reschedule_ticks)
sns.despine()
# Histogram of `waiting_days` with one bin per day and a log-scaled y axis.
# Each bar is annotated with its share of all rows in `med`; the first bar
# (the [0, 1) bin) is highlighted with the base colour.
fig = plt.figure(figsize=(12, 7))
ax = fig.add_subplot(1, 1, 1)
ax.grid(zorder=0)
ax.xaxis.grid(color='#A9A9A9', linestyle='-')
# The edges must run one step past the largest value: the last histogram bin
# is closed on both sides, so stopping at max() would merge the two largest
# day values into a single bar (and leave the last tick label without a bar).
bins = np.arange(0, med['waiting_days'].max() + 2, 1)
freq, bins, patches = plt.hist(data=med, x='waiting_days', bins=bins, edgecolor='black', zorder=3)
bin_centers = np.diff(bins) * 0.5 + bins[:-1]
plt.yscale('log')
for fr, x, patch in zip(freq, bin_centers, patches):
    patch.set_fc('#A9A9A9')
    height = int(fr)
    # Percentage of all appointments that fall into this bin, written just
    # above the bar.
    plt.annotate(
        "{:.0%}".format(height / len(med)),
        xy=(x, height),
        xytext=(0, 0.2),
        textcoords="offset points",
        ha='center',
        va='bottom',
    )
# Re-colour the first bar after the loop so it stands out from the grey ones.
patches[0].set_fc(base_color)
# Ticks sit at the bin centres and are labelled with the whole-day value.
plt.xticks(
    np.arange(0.5, med['waiting_days'].max() + 1.5, 1),
    np.arange(0, med['waiting_days'].max() + 1, 1),
)
plt.yticks(
    [10, 30, 100, 300, 1000, 3000, 10000, 30000],
    ['10', '30', '100', '300', '1k', '3k', '10k', '30k'],
)
plt.ylabel('Frequency (log scale)', size=13)
plt.xlabel('Days till Rendezvous', size=13)
plt.title('35% of the appointments are scheduled to be in the same day (Log Scale)', y=1.05, size=15)
plt.xlim(-1, 37.25)
sns.despine()
However, there are peaks among the waiting days.
# Count plot of `waiting_days` (log-scaled) that highlights the bars at
# 7, 14, 21, 28 and 35 days and labels them "1 week", "2 weeks", ...
wd_counts = med.waiting_days.value_counts().sort_index()
highlights = med.waiting_days.value_counts()[[7, 14, 21, 28, 35]]
week_txt = []
for ind in highlights.index:
    week_txt.append(f"{int(ind/7)} weeks")
week_txt[0] = '1 week'  # singular form for the first highlighted bar
week = pd.Series(week_txt, index=[7, 14, 21, 28, 35])
colors = []
for index, _ in wd_counts.items():
    colors.append(base_color if index in highlights.index else '#A9A9A9')
counts, idx, txt = bar_data(med, 'waiting_days')
fig = plt.figure(figsize=(12, 7))
g = sns.countplot(data=med, x='waiting_days', palette=colors)
# Write the week label a fixed 100 counts above each highlighted bar.
for index, value in highlights.items():
    plt.text(index, value + 100, week[index], ha='center', va='bottom', size=12)
plt.yscale('log')
plt.yticks(
    [10, 30, 100, 300, 1000, 3000, 10000],
    ['10', '30', '100', '300', '1k', '3k', '10k'],
)
# One x tick every seven days, spanning the range of `idx`.
weekly_ticks = np.arange(idx.min(), idx.max() + 7, 7)
plt.xticks(weekly_ticks, weekly_ticks)
plt.ylabel('Frequency (log scale)', size=13)
plt.xlabel('Days till Rendezvous', size=13)
plt.title('Tendancy to schedule appointments by weekly basis (Log Scale)', size=15)
plt.xlim(-0.5, 37.5)
sns.despine()
The peaks fall exactly on multiples of 7 days, depicting the tendency to schedule appointments on a weekly basis.
# Heat map of appointment counts for every (`sc_dow`, `ap_dow`) pair.
week_names = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]
data = med.groupby(['sc_dow', 'ap_dow']).size().reset_index(name='count')
data_pivot = data.pivot(index='sc_dow', columns='ap_dow', values='count')
plt.figure(figsize=[12, 7])
sns.heatmap(data_pivot, annot=True, fmt='d', cmap='YlGnBu', vmin=0)
# Six tick positions at the cell centres, labelled with every name except the
# last one in `week_names`.
cell_centers = np.arange(0.5, 5.5 + 1, 1)
day_labels = week_names[:-1]
plt.xticks(cell_centers, day_labels)
plt.yticks(cell_centers, day_labels, va='center', rotation=0)
plt.ylabel('Schedule\nDay', size=13, rotation=0, ha='center', va='center', labelpad=35)
plt.xlabel('Appointment Day', size=13)
plt.title('Tendancy of scheduling appointments by weekly basis', y=1.05, size=15)
The diagonal has the highest counts; the heat map depicts the tendency to schedule appointments on a weekly basis, as shown before in the histogram of waiting days.
# Line plot of `no_show_bin` against `waiting_days` on a gridded axis.
fig = plt.figure(figsize=(12, 7))
ax = fig.add_subplot(1, 1, 1)
ax.grid(zorder=0)
ax.xaxis.grid(color='gray', linestyle='-')
sns.lineplot(
    data=med,
    x='waiting_days',
    y='no_show_bin',
    lw=4,
    ax=ax,
    zorder=3,
)
plt.ylabel(
    'Probability\nOf not attending\nAn Appointment',
    size=13,
    rotation=0,
    ha='left',
    va='bottom',
    labelpad=110,
)
plt.xlabel('Days till Appointment', size=13)
# Show the existing y tick positions as whole percentages.
y_tick_positions = plt.yticks()[0]
plt.yticks(y_tick_positions, [f"{yy:.0%}" for yy in y_tick_positions])
plt.title(
    'The longer the waiting time for an appointment, the more likely patients not turning up',
    size=15,
    y=1.05,
)
sns.despine()
# The first label starts with an underscore; the second one names the
# confidence-interval region in the legend.
plt.legend(
    ['_', '95% confidence interval region'],
    frameon=False,
    prop={'size': 12},
    loc='best',
)