In [1]:
import pandas as pd
import numpy as np
import operator

import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
from plotly.graph_objs import *
In [2]:
# Initialise plotly's offline notebook mode before any iplot() call in the
# cells below (presumably required for figures to render inline).
init_notebook_mode()
In [3]:
def sliding_window(data_array, window=2):
    """Smooth a 1-D sequence with a centred moving average.

    Output element ``i`` is the arithmetic mean of the input elements at
    positions ``i - window`` through ``i + window`` (inclusive). Near both
    ends the window is truncated to the positions that actually exist, so
    the mean is taken over fewer elements instead of padding the input.

    Parameters
    ----------
    data_array : sequence
        Indexable sequence of numbers that supports ``len()``.
    window : int, optional
        Number of neighbours taken on each side of every position
        (default 2). ``0`` returns the input values divided by 1.0.

    Returns
    -------
    numpy.ndarray
        Array with the same length as ``data_array`` holding the means.

    Raises
    ------
    ValueError
        If ``window`` is negative. Without this check a negative window
        produces an empty neighbourhood and fails with an unhelpful
        ZeroDivisionError.
    """
    if window < 0:
        raise ValueError("window must be non-negative, got %r" % (window,))

    length = len(data_array)
    smoothed = []
    for i in range(length):
        # Clamp the neighbourhood to the valid index range [0, length).
        lo = max(i - window, 0)
        hi = min(i + window + 1, length)
        # Accumulate left to right so floating-point results do not depend
        # on a library-specific summation order.
        total = 0
        for j in range(lo, hi):
            total += data_array[j]
        smoothed.append(total / float(hi - lo))

    return np.array(smoothed)
In [4]:
# Palette of 20 RGB colour strings, stored under the 'colors' key and indexed
# by series position in the plotting cells.
color = {
    'colors': [
        'rgb(31, 119, 180)',
        'rgb(174, 199, 232)',
        'rgb(255, 127, 14)',
        'rgb(255, 187, 120)',
        'rgb(44, 160, 44)',
        'rgb(152, 223, 138)',
        'rgb(214, 39, 40)',
        'rgb(255, 152, 150)',
        'rgb(148, 103, 189)',
        'rgb(197, 176, 213)',
        'rgb(140, 86, 75)',
        'rgb(196, 156, 148)',
        'rgb(227, 119, 194)',
        'rgb(247, 182, 210)',
        'rgb(127, 127, 127)',
        'rgb(199, 199, 199)',
        'rgb(188, 189, 34)',
        'rgb(219, 219, 141)',
        'rgb(23, 190, 207)',
        'rgb(158, 218, 229)',
    ],
}

"Big Data from Pharmaceutical Patents: A Computational Analysis of Medicinal Chemists’ Bread and Butter"

by N. Schneider, D. M. Lowe, R. A. Sayle, M. A. Tarselli, G. A. Landrum

In [5]:
# DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0 and
# parse_dates=True reproduces its defaults (first CSV column becomes the index).
data = pd.read_csv('EvolutionMajorRXNClassesOverTime.csv', index_col=0, parse_dates=True)
names = list(data)
# The first remaining column supplies the x values; every other column is
# drawn as its own line.
years = np.array(data[names[0]])

d = []
for i, k in enumerate(names[1:]):
    # Smooth each series with a moving average spanning 2 points on each side.
    occ = sliding_window(np.array(data[k]), window=2)
    # Wrap around the palette so more series than colours cannot raise IndexError.
    line_color = color["colors"][i % len(color["colors"])]
    d.append(Scatter(x=years, y=occ, mode='lines', name=k, line=dict(color=line_color)))

layout = Layout(
    height=400,
    width=900,
    # A plain dict replaces the deprecated graph_objs.Margin helper class.
    margin=dict(l=150, r=150, b=50, t=50, pad=4),
    # Trailing padding and '.' are kept verbatim; presumably they shift the
    # centred title to the left - TODO confirm.
    title='Evolution of major reaction classes over time                         .',
    yaxis=dict(zeroline=False, title='Percentage per year'),
    xaxis=dict(showline=True)
)
fig = Figure(data=d, layout=layout)
iplot(fig)
Drawing...
In [6]:
# DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0 and
# parse_dates=True reproduces its defaults (first CSV column becomes the index).
data = pd.read_csv('Heteroatom_alkylation_and_arylation_RXNTypesOverTime.csv',
                   index_col=0, parse_dates=True)
names = list(data)
# The first remaining column supplies the x values; every other column is
# drawn as its own line.
years = np.array(data[names[0]])

d = []
for i, k in enumerate(names[1:]):
    # Smooth each series with a moving average spanning 2 points on each side.
    occ = sliding_window(np.array(data[k]), window=2)
    # Wrap around the palette so more series than colours cannot raise IndexError.
    line_color = color["colors"][i % len(color["colors"])]
    d.append(Scatter(x=years, y=occ, mode='lines', name=k, line=dict(color=line_color)))

layout = Layout(
    height=400,
    width=900,
    # A plain dict replaces the deprecated graph_objs.Margin helper class.
    margin=dict(l=150, r=150, b=50, t=50, pad=4),
    # Trailing padding and '.' are kept verbatim; presumably they shift the
    # centred title to the left - TODO confirm.
    title='Evolution of Heteroatom alkylation and arylation over time                   .',
    yaxis=dict(zeroline=False, title='Percentage per year'),
    xaxis=dict(showline=True)
)
fig = Figure(data=d, layout=layout)
iplot(fig)
Drawing...
In [7]:
# DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0 and
# parse_dates=True reproduces its defaults (first CSV column becomes the index).
data = pd.read_csv('Acylation_and_related_processes_RXNTypesOverTime.csv',
                   index_col=0, parse_dates=True)
names = list(data)
# The first remaining column supplies the x values; every other column is
# drawn as its own line.
years = np.array(data[names[0]])

d = []
for i, k in enumerate(names[1:]):
    # Smooth each series with a moving average spanning 2 points on each side.
    occ = sliding_window(np.array(data[k]), window=2)
    # Wrap around the palette so more series than colours cannot raise IndexError.
    line_color = color["colors"][i % len(color["colors"])]
    d.append(Scatter(x=years, y=occ, mode='lines', name=k, line=dict(color=line_color)))

layout = Layout(
    height=400,
    width=900,
    # A plain dict replaces the deprecated graph_objs.Margin helper class.
    margin=dict(l=150, r=150, b=50, t=50, pad=4),
    # Trailing padding and '.' are kept verbatim; presumably they shift the
    # centred title to the left - TODO confirm.
    title='Evolution of Acylation and related processes over time                         .',
    yaxis=dict(zeroline=False, title='Percentage per year'),
    xaxis=dict(showline=True)
)
fig = Figure(data=d, layout=layout)
iplot(fig)
Drawing...
In [8]:
# DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0 and
# parse_dates=True reproduces its defaults (first CSV column becomes the index).
data = pd.read_csv('Deprotections_RXNTypesOverTime.csv', index_col=0, parse_dates=True)
names = list(data)
# The first remaining column supplies the x values; every other column is
# drawn as its own line.
years = np.array(data[names[0]])

d = []
for i, k in enumerate(names[1:]):
    # Smooth each series with a moving average spanning 2 points on each side.
    occ = sliding_window(np.array(data[k]), window=2)
    # Wrap around the palette so more series than colours cannot raise IndexError.
    line_color = color["colors"][i % len(color["colors"])]
    d.append(Scatter(x=years, y=occ, mode='lines', name=k, line=dict(color=line_color)))

layout = Layout(
    height=400,
    width=900,
    # A plain dict replaces the deprecated graph_objs.Margin helper class.
    margin=dict(l=150, r=150, b=50, t=50, pad=4),
    # Trailing padding and '.' are kept verbatim; presumably they shift the
    # centred title to the left - TODO confirm.
    title='Evolution of Deprotections over time                              .',
    yaxis=dict(zeroline=False, title='Percentage per year'),
    xaxis=dict(showline=True)
)
fig = Figure(data=d, layout=layout)
iplot(fig)
Drawing...
In [9]:
# DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0 and
# parse_dates=True reproduces its defaults (first CSV column becomes the index).
data = pd.read_csv('C-C_bond_formation_RXNTypesOverTime.csv', index_col=0, parse_dates=True)
names = list(data)
# The first remaining column supplies the x values; every other column is
# drawn as its own line.
years = np.array(data[names[0]])

d = []
for i, k in enumerate(names[1:]):
    # Smooth each series with a moving average spanning 2 points on each side.
    occ = sliding_window(np.array(data[k]), window=2)
    # Wrap around the palette so more series than colours cannot raise IndexError.
    line_color = color["colors"][i % len(color["colors"])]
    d.append(Scatter(x=years, y=occ, mode='lines', name=k, line=dict(color=line_color)))

layout = Layout(
    height=400,
    width=900,
    # A plain dict replaces the deprecated graph_objs.Margin helper class.
    margin=dict(l=150, r=150, b=50, t=50, pad=4),
    # Trailing padding and '.' are kept verbatim; presumably they shift the
    # centred title to the left - TODO confirm.
    title='Evolution of C-C bond formation over time                               .',
    yaxis=dict(zeroline=False, title='Percentage per year'),
    xaxis=dict(showline=True)
)
fig = Figure(data=d, layout=layout)
iplot(fig)
Drawing...
In [10]:
# DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0 and
# parse_dates=True reproduces its defaults (first CSV column becomes the index).
data = pd.read_csv('Heterocycle_formation_RXNTypesOverTime.csv', index_col=0, parse_dates=True)
names = list(data)
# The first remaining column supplies the x values; every other column is
# drawn as its own line.
years = np.array(data[names[0]])

d = []
for i, k in enumerate(names[1:]):
    # Smooth each series with a moving average spanning 2 points on each side.
    occ = sliding_window(np.array(data[k]), window=2)
    # Wrap around the palette so more series than colours cannot raise IndexError.
    line_color = color["colors"][i % len(color["colors"])]
    d.append(Scatter(x=years, y=occ, mode='lines', name=k, line=dict(color=line_color)))

layout = Layout(
    height=400,
    width=900,
    # A plain dict replaces the deprecated graph_objs.Margin helper class.
    margin=dict(l=150, r=150, b=50, t=50, pad=4),
    # Trailing padding and '.' are kept verbatim; presumably they shift the
    # centred title to the left - TODO confirm.
    title='Heterocycle formation over time                                .',
    yaxis=dict(zeroline=False, title='Percentage per year'),
    xaxis=dict(showline=True)
)
fig = Figure(data=d, layout=layout)
iplot(fig)
Drawing...
In [11]:
# DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0 and
# parse_dates=True reproduces its defaults (first CSV column becomes the index).
data = pd.read_csv('Protections_RXNTypesOverTime.csv', index_col=0, parse_dates=True)
names = list(data)
# The first remaining column supplies the x values; every other column is
# drawn as its own line.
years = np.array(data[names[0]])

d = []
for i, k in enumerate(names[1:]):
    # Smooth each series with a moving average spanning 2 points on each side.
    occ = sliding_window(np.array(data[k]), window=2)
    # Wrap around the palette so more series than colours cannot raise IndexError.
    line_color = color["colors"][i % len(color["colors"])]
    d.append(Scatter(x=years, y=occ, mode='lines', name=k, line=dict(color=line_color)))

layout = Layout(
    height=400,
    width=900,
    # A plain dict replaces the deprecated graph_objs.Margin helper class.
    margin=dict(l=150, r=150, b=50, t=50, pad=4),
    # Trailing padding and '.' are kept verbatim; presumably they shift the
    # centred title to the left - TODO confirm.
    title='Protections over time                                  .',
    yaxis=dict(zeroline=False, title='Percentage per year'),
    xaxis=dict(showline=True)
)
fig = Figure(data=d, layout=layout)
iplot(fig)
Drawing...
In [12]:
# DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0 and
# parse_dates=True reproduces its defaults (first CSV column becomes the index).
data = pd.read_csv('Oxidations_RXNTypesOverTime.csv', index_col=0, parse_dates=True)
names = list(data)
# The first remaining column supplies the x values; every other column is
# drawn as its own line.
years = np.array(data[names[0]])

d = []
for i, k in enumerate(names[1:]):
    # Smooth each series with a moving average spanning 2 points on each side.
    occ = sliding_window(np.array(data[k]), window=2)
    # Wrap around the palette so more series than colours cannot raise IndexError.
    line_color = color["colors"][i % len(color["colors"])]
    d.append(Scatter(x=years, y=occ, mode='lines', name=k, line=dict(color=line_color)))

layout = Layout(
    height=400,
    width=900,
    # A plain dict replaces the deprecated graph_objs.Margin helper class.
    margin=dict(l=150, r=150, b=50, t=50, pad=4),
    # Trailing padding and '.' are kept verbatim; presumably they shift the
    # centred title to the left - TODO confirm.
    title='Oxidations over time                                  .',
    yaxis=dict(zeroline=False, title='Percentage per year'),
    xaxis=dict(showline=True)
)
fig = Figure(data=d, layout=layout)
iplot(fig)
Drawing...
In [13]:
# DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0 and
# parse_dates=True reproduces its defaults (first CSV column becomes the index).
data = pd.read_csv('Reductions_RXNTypesOverTime.csv', index_col=0, parse_dates=True)
names = list(data)
# The first remaining column supplies the x values; every other column is
# drawn as its own line.
years = np.array(data[names[0]])

d = []
for i, k in enumerate(names[1:]):
    # Smooth each series with a moving average spanning 2 points on each side.
    occ = sliding_window(np.array(data[k]), window=2)
    # Wrap around the palette so more series than colours cannot raise IndexError.
    line_color = color["colors"][i % len(color["colors"])]
    d.append(Scatter(x=years, y=occ, mode='lines', name=k, line=dict(color=line_color)))

layout = Layout(
    height=400,
    width=900,
    # A plain dict replaces the deprecated graph_objs.Margin helper class.
    margin=dict(l=150, r=150, b=50, t=50, pad=4),
    # Trailing padding and '.' are kept verbatim; presumably they shift the
    # centred title to the left - TODO confirm.
    title='Reductions over time                                  .',
    yaxis=dict(zeroline=False, title='Percentage per year'),
    xaxis=dict(showline=True)
)
fig = Figure(data=d, layout=layout)
iplot(fig)
Drawing...
In [14]:
# DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0 and
# parse_dates=True reproduces its defaults (first CSV column becomes the index).
data = pd.read_csv('Functional_group_interconversion_(FGI)_RXNTypesOverTime.csv',
                   index_col=0, parse_dates=True)
names = list(data)
# The first remaining column supplies the x values; every other column is
# drawn as its own line.
years = np.array(data[names[0]])

d = []
for i, k in enumerate(names[1:]):
    # Smooth each series with a moving average spanning 2 points on each side.
    occ = sliding_window(np.array(data[k]), window=2)
    # Wrap around the palette so more series than colours cannot raise IndexError.
    line_color = color["colors"][i % len(color["colors"])]
    d.append(Scatter(x=years, y=occ, mode='lines', name=k, line=dict(color=line_color)))

layout = Layout(
    height=400,
    width=900,
    # A plain dict replaces the deprecated graph_objs.Margin helper class.
    margin=dict(l=150, r=150, b=50, t=50, pad=4),
    legend=dict(
        # legend.x is documented as limited to [-2, 3]; the previous value of
        # 1000 is rejected by plotly versions that validate properties.
        # 1.02 is the documented default for a vertical legend.
        # NOTE(review): confirm the legend lands where it did before.
        x=1.02,
        y=1
    ),
    # Trailing padding and '.' are kept verbatim; presumably they shift the
    # centred title to the left - TODO confirm.
    title='Functional group interconversion over time                       .',
    yaxis=dict(zeroline=False, title='Percentage per year'),
    xaxis=dict(showline=True)
)
fig = Figure(data=d, layout=layout)
iplot(fig)
Drawing...
In [15]:
# DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0 and
# parse_dates=True reproduces its defaults (first CSV column becomes the index).
data = pd.read_csv('Functional_group_addition_(FGA)_RXNTypesOverTime.csv',
                   index_col=0, parse_dates=True)
names = list(data)
# The first remaining column supplies the x values; every other column is
# drawn as its own line.
years = np.array(data[names[0]])

d = []
for i, k in enumerate(names[1:]):
    # Smooth each series with a moving average spanning 2 points on each side.
    occ = sliding_window(np.array(data[k]), window=2)
    # Wrap around the palette so more series than colours cannot raise IndexError.
    line_color = color["colors"][i % len(color["colors"])]
    d.append(Scatter(x=years, y=occ, mode='lines', name=k, line=dict(color=line_color)))

layout = Layout(
    height=400,
    width=900,
    # A plain dict replaces the deprecated graph_objs.Margin helper class.
    margin=dict(l=150, r=150, b=50, t=50, pad=4),
    legend=dict(
        # legend.x is documented as limited to [-2, 3]; the previous value of
        # 1000 is rejected by plotly versions that validate properties.
        # 1.02 is the documented default for a vertical legend.
        # NOTE(review): confirm the legend lands where it did before.
        x=1.02,
        y=1,
        xanchor="left"
    ),
    # Trailing padding and '.' are kept verbatim; presumably they shift the
    # centred title to the left - TODO confirm.
    title='Functional group addition over time                         .',
    yaxis=dict(zeroline=False, title='Percentage per year'),
    xaxis=dict(showline=True)
)
fig = Figure(data=d, layout=layout)
iplot(fig)
Drawing...
Reprinted (adapted) with permission from:
Schneider, N., Lowe, D. M., Sayle, R. A., Tarselli, M. A. & Landrum, G. A. Big Data from Pharmaceutical Patents: A Computational Analysis of Medicinal Chemists’ Bread and Butter. J. Med. Chem. (2016). http://pubs.acs.org/doi/abs/10.1021/acs.jmedchem.6b00153
Copyright 2016 American Chemical Society.