I am getting "timeout" errors when trying to read a large (2GB) CSV file from S3:

df = pd.read_csv('path_to_2GB_file.csv')

The file is hosted privately, so unfortunately I can't make it accessible. I'll be happy to try reading from an open/public S3 repository if anyone knows where to get CSVs of a similar size; unfortunately, I couldn't find any in a quick search.

Output

TimeoutError                              Traceback (most recent call last)
<ipython-input-4-0470fac7215b> in <module>()
----> 1 winsdf = pd.read_csv('s3://dxusers-useast1/sys-opt/flight_creative_exchange_spend_hour/2016-10-10.csv')

/Users/amelio/anaconda/envs/py35/lib/python3.5/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squ
eeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose,
 skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quote
char, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarr
ay, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)                                                                                 
    643                     skip_blank_lines=skip_blank_lines)
    644 
--> 645         return _read(filepath_or_buffer, kwds)
    646 
    647     parser_f.__name__ = name

/Users/amelio/anaconda/envs/py35/lib/python3.5/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    398         return parser
    399 
--> 400     data = parser.read()
    401     parser.close()
    402     return data

/Users/amelio/anaconda/envs/py35/lib/python3.5/site-packages/pandas/io/parsers.py in read(self, nrows)
    936                 raise ValueError('skipfooter not supported for iteration')
    937 
--> 938         ret = self._engine.read(nrows)
    939 
    940         if self.options.get('as_recarray'):

/Users/amelio/anaconda/envs/py35/lib/python3.5/site-packages/pandas/io/parsers.py in read(self, nrows)
   1503     def read(self, nrows=None):
   1504         try:
-> 1505             data = self._reader.read(nrows)
   1506         except StopIteration:
   1507             if self._first_chunk:

pandas/parser.pyx in pandas.parser.TextReader.read (pandas/parser.c:9884)()

pandas/parser.pyx in pandas.parser.TextReader._read_low_memory (pandas/parser.c:10142)()

pandas/parser.pyx in pandas.parser.TextReader._read_rows (pandas/parser.c:10870)()

pandas/parser.pyx in pandas.parser.TextReader._tokenize_rows (pandas/parser.c:10741)()

pandas/parser.pyx in pandas.parser.raise_parser_error (pandas/parser.c:25721)()

/Users/amelio/anaconda/envs/py35/lib/python3.5/socket.py in readinto(self, b)
    573         while True:
    574             try:
--> 575                 return self._sock.recv_into(b)
    576             except timeout:
    577                 self._timeout_occurred = True

/Users/amelio/anaconda/envs/py35/lib/python3.5/ssl.py in recv_into(self, buffer, nbytes, flags)
    927                   "non-zero flags not allowed in calls to recv_into() on %s" %
    928                   self.__class__)
--> 929             return self.read(nbytes, buffer)
    930         else:
    931             return socket.recv_into(self, buffer, nbytes, flags)

/Users/amelio/anaconda/envs/py35/lib/python3.5/ssl.py in read(self, len, buffer)
    789             raise ValueError("Read on closed or unwrapped SSL socket.")
    790         try:
--> 791             return self._sslobj.read(len, buffer)
    792         except SSLError as x:
    793             if x.args[0] == SSL_ERROR_EOF and self.suppress_ragged_eofs:

/Users/amelio/anaconda/envs/py35/lib/python3.5/ssl.py in read(self, len, buffer)
    573         """
    574         if buffer is not None:
--> 575             v = self._sslobj.read(len, buffer)
    576         else:
    577             v = self._sslobj.read(len)

TimeoutError: [Errno 60] Operation timed out

Output of pd.show_versions()

INSTALLED VERSIONS
------------------
commit: None
python: 3.5.2.final.0
python-bits: 64
OS: Darwin
OS-release: 16.1.0
machine: x86_64
processor: i386
byteorder: little
LC_ALL: en_US.UTF-8
LANG: en_US.UTF-8
LOCALE: en_US.UTF-8

pandas: 0.19.0
nose: 1.3.7
pip: 8.1.2
setuptools: 26.1.1.post20160901
Cython: 0.24.1
numpy: 1.11.2
scipy: 0.18.1
statsmodels: 0.6.1
xarray: None
IPython: 5.1.0
sphinx: 1.4.8
patsy: 0.4.1
dateutil: 2.5.3
pytz: 2016.7
blosc: None
bottleneck: 1.1.0
tables: 3.3.0
numexpr: 2.6.1
matplotlib: 1.5.3
openpyxl: 2.4.0
xlrd: 1.0.0
xlwt: 1.1.2
xlsxwriter: 0.9.3
lxml: 3.6.4
bs4: 4.5.1
html5lib: None
httplib2: None
apiclient: None
sqlalchemy: 1.1.1
pymysql: None
psycopg2: 2.6.2 (dt dec pq3 ext lo64)
jinja2: 2.8
boto: 2.42.0
pandas_datareader: None

Comment From: jreback

An easy option with S3 is to use blocked (chunked) reads, e.g. via the s3fs package; it uses boto3 under the hood, and pandas will be adopting it at some point.