Playing with Pandas Sudhir
In [65]:
import pandas as pd
In [66]:
!ls -l PastHires.csv
-rwxr-xr-x@ 1 sudhirwadhwa  staff  312 Jun 10  2016 PastHires.csv
In [67]:
# Load the CSV (path is relative to the current working directory) into a DataFrame.
PastHires = pd.read_csv("PastHires.csv")
In [40]:
!cat PastHires.csv













In [68]:
type(PastHires)
Out[68]:
pandas.core.frame.DataFrame

Q. What is a DataFrame?

A. A DataFrame is a two-dimensional labeled data structure whose columns can hold different types. You can think of it as a spreadsheet: it has rows and columns.

In [69]:
# show me all the columns

PastHires.columns
Out[69]:
Index(['Years Experience', 'Employed?', 'Previous employers',
       'Level of Education', 'Top-tier school', 'Interned', 'Hired'],
      dtype='object')
In [70]:
# Row labels of the frame; read_csv was called without an index column, so this is the default RangeIndex.
PastHires.index
Out[70]:
RangeIndex(start=0, stop=13, step=1)
In [71]:
# Raw contents as a NumPy array; dtype is object because the columns mix integers and strings.
PastHires.values
Out[71]:
array([[10, 'Y', 4, 'BS', 'N', 'N', 'Y'],
       [0, 'N', 0, 'BS', 'Y', 'Y', 'Y'],
       [7, 'N', 6, 'BS', 'N', 'N', 'N'],
       [2, 'Y', 1, 'MS', 'Y', 'N', 'Y'],
       [20, 'N', 2, 'PhD', 'Y', 'N', 'N'],
       [0, 'N', 0, 'PhD', 'Y', 'Y', 'Y'],
       [5, 'Y', 2, 'MS', 'N', 'Y', 'Y'],
       [3, 'N', 1, 'BS', 'N', 'Y', 'Y'],
       [15, 'Y', 5, 'BS', 'N', 'N', 'Y'],
       [0, 'N', 0, 'BS', 'N', 'N', 'N'],
       [1, 'N', 1, 'PhD', 'Y', 'N', 'N'],
       [4, 'Y', 1, 'BS', 'N', 'Y', 'Y'],
       [0, 'N', 0, 'PhD', 'Y', 'N', 'Y']], dtype=object)
In [45]:
# Show me top 5
PastHires.head()
Out[45]:
Years Experience Employed? Previous employers Level of Education Top-tier school Interned Hired
0 10 Y 4 BS N N Y
1 0 N 0 BS Y Y Y
2 7 N 6 BS N N N
3 2 Y 1 MS Y N Y
4 20 N 2 PhD Y N N
In [72]:
# Show me top 10
PastHires.head(10)
Out[72]:
Years Experience Employed? Previous employers Level of Education Top-tier school Interned Hired
0 10 Y 4 BS N N Y
1 0 N 0 BS Y Y Y
2 7 N 6 BS N N N
3 2 Y 1 MS Y N Y
4 20 N 2 PhD Y N N
5 0 N 0 PhD Y Y Y
6 5 Y 2 MS N Y Y
7 3 N 1 BS N Y Y
8 15 Y 5 BS N N Y
9 0 N 0 BS N N N
In [48]:
# Show me the last row in the DataFrame
PastHires.tail(1)
Out[48]:
Years Experience Employed? Previous employers Level of Education Top-tier school Interned Hired
12 0 N 0 PhD Y N Y
In [49]:
# Show me all the values
PastHires.values
Out[49]:
array([[10, 'Y', 4, 'BS', 'N', 'N', 'Y'],
       [0, 'N', 0, 'BS', 'Y', 'Y', 'Y'],
       [7, 'N', 6, 'BS', 'N', 'N', 'N'],
       [2, 'Y', 1, 'MS', 'Y', 'N', 'Y'],
       [20, 'N', 2, 'PhD', 'Y', 'N', 'N'],
       [0, 'N', 0, 'PhD', 'Y', 'Y', 'Y'],
       [5, 'Y', 2, 'MS', 'N', 'Y', 'Y'],
       [3, 'N', 1, 'BS', 'N', 'Y', 'Y'],
       [15, 'Y', 5, 'BS', 'N', 'N', 'Y'],
       [0, 'N', 0, 'BS', 'N', 'N', 'N'],
       [1, 'N', 1, 'PhD', 'Y', 'N', 'N'],
       [4, 'Y', 1, 'BS', 'N', 'Y', 'Y'],
       [0, 'N', 0, 'PhD', 'Y', 'N', 'Y']], dtype=object)
In [50]:
# Just show me the row at position 3 (positions are zero-based, so this is the fourth row)
PastHires.iloc[3]
Out[50]:
Years Experience       2
Employed?              Y
Previous employers     1
Level of Education    MS
Top-tier school        Y
Interned               N
Hired                  Y
Name: 3, dtype: object
In [51]:
# Position 12 is the last valid row: 13 rows means zero-based positions 0 through 12.
PastHires.iloc[12]
Out[51]:
Years Experience        0
Employed?               N
Previous employers      0
Level of Education    PhD
Top-tier school         Y
Interned                N
Hired                   Y
Name: 12, dtype: object
In [73]:
# This will give you IndexError: 
# single positional indexer is out-of-bounds

PastHires.iloc[13]
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-73-016396b3ac30> in <module>()
      2 # single positional indexer is out-of-bounds
      3 
----> 4 PastHires.iloc[13]

/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py in __getitem__(self, key)
   1326         else:
   1327             key = com._apply_if_callable(key, self.obj)
-> 1328             return self._getitem_axis(key, axis=0)
   1329 
   1330     def _is_scalar_access(self, key):

/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
   1747 
   1748             # validate the location
-> 1749             self._is_valid_integer(key, axis)
   1750 
   1751             return self._get_loc(key, axis=axis)

/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py in _is_valid_integer(self, key, axis)
   1636         l = len(ax)
   1637         if key >= l or key < -l:
-> 1638             raise IndexError("single positional indexer is out-of-bounds")
   1639         return True
   1640 

IndexError: single positional indexer is out-of-bounds
In [53]:
# What are the Data Types in the given Data Frame?
PastHires.dtypes
Out[53]:
Years Experience       int64
Employed?             object
Previous employers     int64
Level of Education    object
Top-tier school       object
Interned              object
Hired                 object
dtype: object
In [74]:
# (number of rows, number of columns)
PastHires.shape
Out[74]:
(13, 7)
In [54]:
# Count how many columns there are of each data type.
# DataFrame.get_dtype_counts() was deprecated and then removed in pandas 1.0,
# so it raises AttributeError on current versions; counting the entries of
# .dtypes is the supported equivalent.
PastHires.dtypes.value_counts()
Out[54]:
int64     2
object    5
dtype: int64
In [55]:
PastHires.columns
Out[55]:
Index(['Years Experience', 'Employed?', 'Previous employers',
       'Level of Education', 'Top-tier school', 'Interned', 'Hired'],
      dtype='object')
In [56]:
# Just show me Level of Education in my Data frame
PastHires["Level of Education"]
Out[56]:
0      BS
1      BS
2      BS
3      MS
4     PhD
5     PhD
6      MS
7      BS
8      BS
9      BS
10    PhD
11     BS
12    PhD
Name: Level of Education, dtype: object
In [58]:
# Keep only the rows whose education level is exactly "PhD".
is_phd = PastHires['Level of Education'] == "PhD"
phds = PastHires.loc[is_phd]
In [59]:
phds
Out[59]:
Years Experience Employed? Previous employers Level of Education Top-tier school Interned Hired
4 20 N 2 PhD Y N N
5 0 N 0 PhD Y Y Y
10 1 N 1 PhD Y N N
12 0 N 0 PhD Y N Y
In [75]:
# PhD holders who were also hired: both row-wise conditions must hold.
has_phd = PastHires['Level of Education'] == "PhD"
was_hired = PastHires['Hired'] == "Y"
phdWhoGotHired = PastHires[has_phd & was_hired]
In [61]:
phdWhoGotHired
Out[61]:
Years Experience Employed? Previous employers Level of Education Top-tier school Interned Hired
5 0 N 0 PhD Y Y Y
12 0 N 0 PhD Y N Y