Discovering¶
Part-01¶
The NOAA keeps a daily record of lightning strikes across much of North America. We have been tasked with performing EDA on this dataset so that it can be used to predict future lightning strikes in this region.
Overview¶
We will use pandas to examine 2018 lightning strike data collected by the National Oceanic and Atmospheric Administration (NOAA). Then, we will calculate the total number of strikes for each month and plot this information on a bar graph.
Import packages and libraries¶
Before getting started, we need to import all the required libraries and extensions. Throughout the course, we will use pandas, numpy, and datetime for data operations, and matplotlib.pyplot and seaborn for plotting.
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
# Read the dataset
df = pd.read_csv('./eda_using_basic_data_functions_in_python_dataset1.csv')
# Inspect the first 10 rows
df.head(10)
|   | date | number_of_strikes | center_point_geom |
|---|---|---|---|
| 0 | 2018-01-03 | 194 | POINT(-75 27) |
| 1 | 2018-01-03 | 41 | POINT(-78.4 29) |
| 2 | 2018-01-03 | 33 | POINT(-73.9 27) |
| 3 | 2018-01-03 | 38 | POINT(-73.8 27) |
| 4 | 2018-01-03 | 92 | POINT(-79 28) |
| 5 | 2018-01-03 | 119 | POINT(-78 28) |
| 6 | 2018-01-03 | 35 | POINT(-79.3 28) |
| 7 | 2018-01-03 | 60 | POINT(-79.1 28) |
| 8 | 2018-01-03 | 41 | POINT(-78.7 28) |
| 9 | 2018-01-03 | 119 | POINT(-78.6 28) |
Notice that the data is structured as one row per date and location: each row records the number of strikes at a given geometric point on a given day.
A quick way to determine how many rows and columns of data there are in total is to use df.shape. The information is output as a tuple: (rows, columns).
df.shape
(3401012, 3)
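On a toy frame with the same columns (synthetic values, just for illustration), shape and size behave like this:

```python
import pandas as pd

# A tiny frame mimicking the strike data's columns (made-up values)
toy = pd.DataFrame({
    'date': ['2018-01-03', '2018-01-03'],
    'number_of_strikes': [194, 41],
    'center_point_geom': ['POINT(-75 27)', 'POINT(-78.4 29)'],
})

print(toy.shape)  # (2, 3): 2 rows, 3 columns
print(toy.size)   # 6: rows * columns
```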
# Get more information about the data
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3401012 entries, 0 to 3401011
Data columns (total 3 columns):
 #   Column             Dtype
---  ------             -----
 0   date               object
 1   number_of_strikes  int64
 2   center_point_geom  object
dtypes: int64(1), object(2)
memory usage: 77.8+ MB
Convert the date column to datetime¶
As you can see, the date column's data type is 'object' rather than a date type; objects are just strings. The datetime data type provides additional functionality for working with dates and times, so we must convert the column first.
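On a toy Series the conversion looks like this (values are illustrative; passing an explicit format string is optional here but makes the parsing unambiguous):

```python
import pandas as pd

s = pd.Series(['2018-01-03', '2018-08-29'])
converted = pd.to_datetime(s, format='%Y-%m-%d')

print(converted.dtype)  # datetime64[ns]
# The .dt accessor now works, e.g. pulling out the month
print(converted.dt.month.tolist())  # [1, 8]
```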
# Convert the date column to datetime
df['date'] = pd.to_datetime(df['date'])
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3401012 entries, 0 to 3401011
Data columns (total 3 columns):
 #   Column             Dtype
---  ------             -----
 0   date               datetime64[ns]
 1   number_of_strikes  int64
 2   center_point_geom  object
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 77.8+ MB
Calculate the days with the most strikes¶
As part of discovery, you want to get an idea of the highest data points. For this dataset, we can calculate the top 10 days of 2018 with the highest number of lightning strikes using the groupby(), sum(), and sort_values() functions from pandas.
When using groupby() on the date column, the function combines all rows with the same date into a single row.
Then, sum() performs a sum calculation on all other summable columns. In this case, we are summing all the lightning strikes that happened on each day. Notice that the center_point_geom column is not included in the output. That's because, as a string object, this column is not summable.
Finally, sort_values() returns the results in descending order of total strikes for each day in the data.
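The same chain can be checked on a toy frame (synthetic values) where two locations share a date:

```python
import pandas as pd

toy = pd.DataFrame({
    'date': ['2018-08-29', '2018-08-29', '2018-08-17'],
    'number_of_strikes': [100, 50, 75],
})

top = (toy[['date', 'number_of_strikes']]
       .groupby(['date'])
       .sum()
       .sort_values('number_of_strikes', ascending=False)
       .reset_index())

print(top)
# The two 2018-08-29 rows collapse into one row totalling 150 strikes
```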
# Calculate the dates with the most lightning strikes
df[['date', 'number_of_strikes']].groupby(['date']).sum().sort_values('number_of_strikes', ascending=False).reset_index().head(10)
|   | date | number_of_strikes |
|---|---|---|
| 0 | 2018-08-29 | 1070457 |
| 1 | 2018-08-17 | 969774 |
| 2 | 2018-08-28 | 917199 |
| 3 | 2018-08-27 | 824589 |
| 4 | 2018-08-30 | 802170 |
| 5 | 2018-08-19 | 786225 |
| 6 | 2018-08-18 | 741180 |
| 7 | 2018-08-16 | 734475 |
| 8 | 2018-08-31 | 723624 |
| 9 | 2018-08-15 | 673455 |
Extract the month data¶
# Create a new 'month' column
df['month'] = df['date'].dt.month
df.head()
|   | date | number_of_strikes | center_point_geom | month |
|---|---|---|---|---|
| 0 | 2018-01-03 | 194 | POINT(-75 27) | 1 |
| 1 | 2018-01-03 | 41 | POINT(-78.4 29) | 1 |
| 2 | 2018-01-03 | 33 | POINT(-73.9 27) | 1 |
| 3 | 2018-01-03 | 38 | POINT(-73.8 27) | 1 |
| 4 | 2018-01-03 | 92 | POINT(-79 28) | 1 |
Calculate the number of strikes per month¶
# Calculate the total number of strikes per month
df[['month', 'number_of_strikes']].groupby(['month']).sum().sort_values('number_of_strikes', ascending=False)
|   | number_of_strikes |
|---|---|
| month |   |
| 8 | 15525255 |
| 7 | 8320400 |
| 6 | 6445083 |
| 5 | 4166726 |
| 9 | 3018336 |
| 2 | 2071315 |
| 4 | 1524339 |
| 10 | 1093962 |
| 1 | 860045 |
| 3 | 854168 |
| 11 | 409263 |
| 12 | 312097 |
Convert the month number to text¶
To help read the data more easily, let's convert the month number to text using the datetime accessor dt.month_name() and add this as a new column in the dataframe. str.slice(stop=3) keeps only the first three letters of each month name.
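A quick check of the two-step transformation on a pair of toy dates:

```python
import pandas as pd

dates = pd.to_datetime(pd.Series(['2018-01-03', '2018-08-29']))

# Full month name, then keep only the first three letters
month_txt = dates.dt.month_name().str.slice(stop=3)
print(month_txt.tolist())  # ['Jan', 'Aug']
```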
# Create a new 'month_txt' column
df['month_txt'] = df['date'].dt.month_name().str.slice(stop=3)
df.head()
|   | date | number_of_strikes | center_point_geom | month | month_txt |
|---|---|---|---|---|---|
| 0 | 2018-01-03 | 194 | POINT(-75 27) | 1 | Jan |
| 1 | 2018-01-03 | 41 | POINT(-78.4 29) | 1 | Jan |
| 2 | 2018-01-03 | 33 | POINT(-73.9 27) | 1 | Jan |
| 3 | 2018-01-03 | 38 | POINT(-73.8 27) | 1 | Jan |
| 4 | 2018-01-03 | 92 | POINT(-79 28) | 1 | Jan |
Create a new dataframe¶
The objective is to plot the total number of strikes per month as a bar graph. To help with the plotting, we will create a new dataframe called df_by_month. This will allow us to easily access the month, month text, and total number of strikes for each month.
# Create a new helper dataframe for plotting
df_by_month = df[['month', 'month_txt', 'number_of_strikes']].groupby(['month', 'month_txt']).sum().sort_values('month', ascending=True).reset_index()
df_by_month
|   | month | month_txt | number_of_strikes |
|---|---|---|---|
| 0 | 1 | Jan | 860045 |
| 1 | 2 | Feb | 2071315 |
| 2 | 3 | Mar | 854168 |
| 3 | 4 | Apr | 1524339 |
| 4 | 5 | May | 4166726 |
| 5 | 6 | Jun | 6445083 |
| 6 | 7 | Jul | 8320400 |
| 7 | 8 | Aug | 15525255 |
| 8 | 9 | Sep | 3018336 |
| 9 | 10 | Oct | 1093962 |
| 10 | 11 | Nov | 409263 |
| 11 | 12 | Dec | 312097 |
Make a bar chart¶
plt.bar(x=df_by_month['month_txt'],
        height=df_by_month['number_of_strikes'], label='Number of strikes')
plt.xlabel("Months (2018)")
plt.ylabel("Number of lightning strikes")
plt.title("Number of lightning strikes in 2018 by month")
plt.legend()
plt.show()
Part-02¶
The firm wants insights into unicorn companies: companies that are valued at over one billion dollars. The data you will use for this task provides information on over 1,000 unicorn companies, including their industry, country, year founded, and select investors. You will use this information to gain insights into how and when companies reach this prestigious milestone and to make recommendations for next steps to the investing firm.
# Load the dataframe
companies = pd.read_csv('./Unicorn_Companies.csv')
Display the first five rows of the data¶
companies.head()
|   | Company | Valuation | Date Joined | Industry | City | Country/Region | Continent | Year Founded | Funding | Select Investors |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Bytedance | $180B | 4/7/17 | Artificial intelligence | Beijing | China | Asia | 2012 | $8B | Sequoia Capital China, SIG Asia Investments, S... |
| 1 | SpaceX | $100B | 12/1/12 | Other | Hawthorne | United States | North America | 2002 | $7B | Founders Fund, Draper Fisher Jurvetson, Rothen... |
| 2 | SHEIN | $100B | 7/3/18 | E-commerce & direct-to-consumer | Shenzhen | China | Asia | 2008 | $2B | Tiger Global Management, Sequoia Capital China... |
| 3 | Stripe | $95B | 1/23/14 | Fintech | San Francisco | United States | North America | 2010 | $2B | Khosla Ventures, LowercaseCapital, capitalG |
| 4 | Klarna | $46B | 12/12/11 | Fintech | Stockholm | Sweden | Europe | 2005 | $4B | Institutional Venture Partners, Sequoia Capita... |
Exploration
The "Date Joined" column records when the company became a "unicorn," reaching one billion dollars in valuation.
The "Select Investors" column lists the top investors in the company.
Assess the size of the dataset¶
companies.size
10740
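size counts individual cells rather than rows: it always equals the product of the two shape dimensions, which is why 1074 rows by 10 columns gives 10740. A toy check (made-up values):

```python
import pandas as pd

toy = pd.DataFrame({'Company': ['A', 'B', 'C'], 'Valuation': ['$1B', '$2B', '$3B']})

print(toy.shape)  # (3, 2)
print(toy.size)   # 6 = 3 rows * 2 columns
```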
Get basic information about the dataset¶
companies.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1074 entries, 0 to 1073
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Company           1074 non-null   object
 1   Valuation         1074 non-null   object
 2   Date Joined       1074 non-null   object
 3   Industry          1074 non-null   object
 4   City              1058 non-null   object
 5   Country/Region    1074 non-null   object
 6   Continent         1074 non-null   object
 7   Year Founded      1074 non-null   int64
 8   Funding           1074 non-null   object
 9   Select Investors  1073 non-null   object
dtypes: int64(1), object(9)
memory usage: 84.0+ KB
Exploration
Dtype is listed as int64 in the Year Founded column. This means that the year a company was founded is represented as an integer.
Dtype is listed as object for the Date Joined column. This means that the date a company became a unicorn is represented as an object.
Descriptive statistics¶
# Get descriptive statistics
companies.describe()
|   | Year Founded |
|---|---|
| count | 1074.000000 |
| mean | 2012.895717 |
| std | 5.698573 |
| min | 1919.000000 |
| 25% | 2011.000000 |
| 50% | 2014.000000 |
| 75% | 2016.000000 |
| max | 2021.000000 |
Convert the Date Joined column to the datetime data type¶
companies['Date Joined'] = pd.to_datetime(companies['Date Joined'])
C:\Users\pc\AppData\Local\Temp\ipykernel_7180\1458111891.py:1: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  companies['Date Joined']=pd.to_datetime(companies['Date Joined'])
companies.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1074 entries, 0 to 1073
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Company           1074 non-null   object
 1   Valuation         1074 non-null   object
 2   Date Joined       1074 non-null   datetime64[ns]
 3   Industry          1074 non-null   object
 4   City              1058 non-null   object
 5   Country/Region    1074 non-null   object
 6   Continent         1074 non-null   object
 7   Year Founded      1074 non-null   int64
 8   Funding           1074 non-null   object
 9   Select Investors  1073 non-null   object
dtypes: datetime64[ns](1), int64(1), object(8)
memory usage: 84.0+ KB
Create a Year Joined column¶
companies['Year Joined'] = companies['Date Joined'].dt.year
companies.head()
|   | Company | Valuation | Date Joined | Industry | City | Country/Region | Continent | Year Founded | Funding | Select Investors | Year Joined |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Bytedance | $180B | 2017-04-07 | Artificial intelligence | Beijing | China | Asia | 2012 | $8B | Sequoia Capital China, SIG Asia Investments, S... | 2017 |
| 1 | SpaceX | $100B | 2012-12-01 | Other | Hawthorne | United States | North America | 2002 | $7B | Founders Fund, Draper Fisher Jurvetson, Rothen... | 2012 |
| 2 | SHEIN | $100B | 2018-07-03 | E-commerce & direct-to-consumer | Shenzhen | China | Asia | 2008 | $2B | Tiger Global Management, Sequoia Capital China... | 2018 |
| 3 | Stripe | $95B | 2014-01-23 | Fintech | San Francisco | United States | North America | 2010 | $2B | Khosla Ventures, LowercaseCapital, capitalG | 2014 |
| 4 | Klarna | $46B | 2011-12-12 | Fintech | Stockholm | Sweden | Europe | 2005 | $4B | Institutional Venture Partners, Sequoia Capita... | 2011 |
Result and evaluation¶
Take a sample of the data¶
It is not necessary to take a sample of the data in order to conduct the visualizations and EDA that follow. But you may encounter scenarios in the future where you will need to take a sample of the data due to time and resource limitations. For the purpose of developing your skills around sampling, take a sample of the data and work with that sample for the next steps of analysis. Use the sample() function for this task.
- Use sample() with the n parameter set to 50 to randomly sample 50 unicorn companies from the data. Be sure to specify the random_state parameter to ensure reproducibility of your work. Save the result to a variable called companies_sample.
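The effect of random_state can be sketched on a toy frame: the same seed always returns the same rows, which is what makes the analysis reproducible.

```python
import pandas as pd

toy = pd.DataFrame({'Company': list('ABCDEFGHIJ'), 'Valuation': range(10)})

s1 = toy.sample(n=3, random_state=42)
s2 = toy.sample(n=3, random_state=42)

# Same seed, same sample
print(s1.index.equals(s2.index))  # True
```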
# Sample the data
companies_sample = companies.sample(n=50, random_state=42)
Visualize the time it took companies to reach unicorn status¶
# Prepare data for plotting
# Create a new `years_till_unicorn` column
companies_sample['years_till_unicorn'] = companies_sample['Year Joined'] - companies_sample['Year Founded']
# Group the data by `Industry` and total the `years_till_unicorn` column for each industry
grouped = companies_sample[['Industry', 'years_till_unicorn']].groupby('Industry').sum().sort_values('years_till_unicorn', ascending=True).reset_index()
grouped
|   | Industry | years_till_unicorn |
|---|---|---|
| 0 | Consumer & retail | 1 |
| 1 | Auto & transportation | 2 |
| 2 | Artificial intelligence | 11 |
| 3 | Data management & analytics | 18 |
| 4 | Mobile & telecommunications | 21 |
| 5 | Cybersecurity | 23 |
| 6 | Other | 26 |
| 7 | Health | 32 |
| 8 | Supply chain, logistics, & delivery | 37 |
| 9 | E-commerce & direct-to-consumer | 38 |
| 10 | Fintech | 57 |
| 11 | Internet software & services | 95 |
# Create the plot
plt.bar(grouped.Industry, grouped.years_till_unicorn)
# Set title
plt.title("Bar plot of total years taken by companies to become unicorns per industry (from sample)")
# Set x-axis label
plt.xlabel("Industry")
# Set y-axis label
plt.ylabel("Total number of years")
# Rotate labels on the x-axis to avoid overlapping text
plt.xticks(rotation=45, horizontalalignment='right')
plt.show()
- This bar plot shows that for this sample of unicorn companies, the largest combined time to reach unicorn status occurred in the Internet software & services industry, while the smallest occurred in the Consumer & retail industry.
Visualize the maximum unicorn company valuation per industry¶
Visualize unicorn companies' maximum valuation for each industry represented in the sample. To create a bar plot to visualize this, use the bar() function from the matplotlib.pyplot module. Before plotting, create a new column that represents the companies' valuations as numbers (instead of strings, as they're currently represented). Then, use this new column to plot your data.
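As a sketch (not the notebook's exact steps), the same cleanup can be done in one pass with str.strip and pd.to_numeric on a toy Series:

```python
import pandas as pd

valuation = pd.Series(['$180B', '$2B', '$95B'])

# Strip the leading '$' and trailing 'B', then convert to numbers
as_billions = pd.to_numeric(valuation.str.strip('$B'))
print(as_billions.tolist())  # [180, 2, 95]
```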
# Create a column representing company valuation as numeric data
# Create new column
companies_sample['valuation_billions'] = companies_sample['Valuation']
# Remove the '$' from each value
companies_sample['valuation_billions'] = companies_sample['valuation_billions'].str.replace('$', '')
# Remove the 'B' from each value
companies_sample['valuation_billions'] = companies_sample['valuation_billions'].str.replace('B', '')
# Convert column to type int
companies_sample['valuation_billions'] = companies_sample['valuation_billions'].astype('int')
companies_sample.head()
| Company | Valuation | Date Joined | Industry | City | Country/Region | Continent | Year Founded | Funding | Select Investors | Year Joined | years_till_unicorn | valuation_billions | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 542 | Aiven | $2B | 2021-10-18 | Internet software & services | Helsinki | Finland | Europe | 2016 | $210M | Institutional Venture Partners, Atomico, Early... | 2021 | 5 | 2 |
| 370 | Jusfoun Big Data | $2B | 2018-07-09 | Data management & analytics | Beijing | China | Asia | 2010 | $137M | Boxin Capital, DT Capital Partners, IDG Capital | 2018 | 8 | 2 |
| 307 | Innovaccer | $3B | 2021-02-19 | Health | San Francisco | United States | North America | 2014 | $379M | M12, WestBridge Capital, Lightspeed Venture Pa... | 2021 | 7 | 3 |
| 493 | Algolia | $2B | 2021-07-28 | Internet software & services | San Francisco | United States | North America | 2012 | $334M | Accel, Alven Capital, Storm Ventures | 2021 | 9 | 2 |
| 350 | SouChe Holdings | $3B | 2017-11-01 | E-commerce & direct-to-consumer | Hangzhou | China | Asia | 2012 | $1B | Morningside Ventures, Warburg Pincus, CreditEa... | 2017 | 5 | 3 |
# Prepare data for plotting
grouped = (companies_sample[["Industry", "valuation_billions"]]
           .groupby("Industry")
           .max()
           .sort_values(by="valuation_billions")
           )
grouped
|   | valuation_billions |
|---|---|
| Industry |   |
| Auto & transportation | 1 |
| Consumer & retail | 1 |
| Other | 2 |
| Supply chain, logistics, & delivery | 2 |
| Cybersecurity | 3 |
| Health | 3 |
| Data management & analytics | 4 |
| E-commerce & direct-to-consumer | 4 |
| Internet software & services | 5 |
| Mobile & telecommunications | 7 |
| Fintech | 10 |
| Artificial intelligence | 12 |
# Create a bar plot
# with the Industry column as the categories of the bars
# and the new valuation column as the heights of the bars
plt.bar(grouped.index, grouped["valuation_billions"])
# Set title
plt.title("Bar plot of maximum unicorn company valuation per industry (from sample)")
# Set x-axis label
plt.xlabel("Industry")
# Set y-axis label
plt.ylabel("Maximum valuation in billions of dollars")
# Rotate labels on the x-axis to avoid overlap in the positions of the text
plt.xticks(rotation=45, horizontalalignment='right')
# Display the plot
plt.show()
- This bar plot shows that for this sample of unicorn companies, the highest maximum valuation occurred in the Artificial intelligence industry, while the lowest maximum valuation occurred in the Auto & transportation and Consumer & retail industries.
Considerations¶
What are some key takeaways from this analysis?
- Functions in the pandas library can be used to gather characteristics about the data at hand.
  - The info() and describe() functions were especially useful for gathering basic information about a dataset and finding descriptive statistics, respectively.
- Functions in the matplotlib.pyplot module can be used to create visualizations to further understand specific aspects of the data.
  - The bar() function allowed you to create bar plots that helped visualize categorical information about the data. You were able to visualize the time to become a unicorn and the maximum valuation for each industry represented in the sample taken from the data.
What findings would you share with others?
- There are 1074 unicorn companies represented in this dataset.
- Some companies took longer to reach unicorn status but have accrued high valuation as of March 2022. Companies could take longer to achieve unicorn status for a number of reasons, including requiring more funding or taking longer to develop a business model.
What recommendations would you share with stakeholders based on these findings?
It may be helpful to focus more on industry specifics. Next steps to consider:
- Identify the main industries that the investing firm is interested in investing in.
- Select a subset of this data that includes only companies in those industries.
- Analyze that subset more closely. Determine which companies have higher valuation but do not have as many investors currently. They may be good candidates to consider investing in.
References¶
Bhat, M.A. (2022, March). Unicorn Companies [Data set].