I am getting "timeout" errors when trying to read a large CSV (2GB) from S3:
df = pd.read_csv('s3://<bucket>/path_to_2GB_file.csv')
The file is hosted privately, so unfortunately I can't make it accessible. I'd be happy to try reading from an open/public S3 bucket instead if anyone knows where to find CSVs of a similar size; unfortunately I couldn't find any in a quick search.
Output
TimeoutError Traceback (most recent call last)
<ipython-input-4-0470fac7215b> in <module>()
----> 1 winsdf = pd.read_csv('s3://dxusers-useast1/sys-opt/flight_creative_exchange_spend_hour/2016-10-10.csv')
/Users/amelio/anaconda/envs/py35/lib/python3.5/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
643 skip_blank_lines=skip_blank_lines)
644
--> 645 return _read(filepath_or_buffer, kwds)
646
647 parser_f.__name__ = name
/Users/amelio/anaconda/envs/py35/lib/python3.5/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
398 return parser
399
--> 400 data = parser.read()
401 parser.close()
402 return data
/Users/amelio/anaconda/envs/py35/lib/python3.5/site-packages/pandas/io/parsers.py in read(self, nrows)
936 raise ValueError('skipfooter not supported for iteration')
937
--> 938 ret = self._engine.read(nrows)
939
940 if self.options.get('as_recarray'):
/Users/amelio/anaconda/envs/py35/lib/python3.5/site-packages/pandas/io/parsers.py in read(self, nrows)
1503 def read(self, nrows=None):
1504 try:
-> 1505 data = self._reader.read(nrows)
1506 except StopIteration:
1507 if self._first_chunk:
pandas/parser.pyx in pandas.parser.TextReader.read (pandas/parser.c:9884)()
pandas/parser.pyx in pandas.parser.TextReader._read_low_memory (pandas/parser.c:10142)()
pandas/parser.pyx in pandas.parser.TextReader._read_rows (pandas/parser.c:10870)()
pandas/parser.pyx in pandas.parser.TextReader._tokenize_rows (pandas/parser.c:10741)()
pandas/parser.pyx in pandas.parser.raise_parser_error (pandas/parser.c:25721)()
/Users/amelio/anaconda/envs/py35/lib/python3.5/socket.py in readinto(self, b)
573 while True:
574 try:
--> 575 return self._sock.recv_into(b)
576 except timeout:
577 self._timeout_occurred = True
/Users/amelio/anaconda/envs/py35/lib/python3.5/ssl.py in recv_into(self, buffer, nbytes, flags)
927 "non-zero flags not allowed in calls to recv_into() on %s" %
928 self.__class__)
--> 929 return self.read(nbytes, buffer)
930 else:
931 return socket.recv_into(self, buffer, nbytes, flags)
/Users/amelio/anaconda/envs/py35/lib/python3.5/ssl.py in read(self, len, buffer)
789 raise ValueError("Read on closed or unwrapped SSL socket.")
790 try:
--> 791 return self._sslobj.read(len, buffer)
792 except SSLError as x:
793 if x.args[0] == SSL_ERROR_EOF and self.suppress_ragged_eofs:
/Users/amelio/anaconda/envs/py35/lib/python3.5/ssl.py in read(self, len, buffer)
573 """
574 if buffer is not None:
--> 575 v = self._sslobj.read(len, buffer)
576 else:
577 v = self._sslobj.read(len)
TimeoutError: [Errno 60] Operation timed out
Output of pd.show_versions()
INSTALLED VERSIONS
------------------
commit: None
python: 3.5.2.final.0
python-bits: 64
OS: Darwin
OS-release: 16.1.0
machine: x86_64
processor: i386
byteorder: little
LC_ALL: en_US.UTF-8
LANG: en_US.UTF-8
LOCALE: en_US.UTF-8
pandas: 0.19.0
nose: 1.3.7
pip: 8.1.2
setuptools: 26.1.1.post20160901
Cython: 0.24.1
numpy: 1.11.2
scipy: 0.18.1
statsmodels: 0.6.1
xarray: None
IPython: 5.1.0
sphinx: 1.4.8
patsy: 0.4.1
dateutil: 2.5.3
pytz: 2016.7
blosc: None
bottleneck: 1.1.0
tables: 3.3.0
numexpr: 2.6.1
matplotlib: 1.5.3
openpyxl: 2.4.0
xlrd: 1.0.0
xlwt: 1.1.2
xlsxwriter: 0.9.3
lxml: 3.6.4
bs4: 4.5.1
html5lib: None
httplib2: None
apiclient: None
sqlalchemy: 1.1.1
pymysql: None
psycopg2: 2.6.2 (dt dec pq3 ext lo64)
jinja2: 2.8
boto: 2.42.0
pandas_datareader: None
Comment From: jreback
An easy option with S3 is to use blocked reads, e.g. via the s3fs package; it uses boto3 under the hood, and pandas will be using it at some point.
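For reference, a minimal sketch of what that could look like, assuming the s3fs package is installed, AWS credentials are configured, and using a placeholder bucket/key rather than the private path above:

import pandas as pd
import s3fs

fs = s3fs.S3FileSystem(anon=False)  # authenticated access to a private bucket

# Stream the object in blocks instead of one long HTTPS read; the block_size
# value is illustrative, not a required setting.
with fs.open('my-bucket/path/to/large_file.csv', 'rb', block_size=5 * 2**20) as f:
    # Optionally also parse in chunks so the whole 2GB is never read in one go
    chunks = pd.read_csv(f, chunksize=500000)
    df = pd.concat(chunks, ignore_index=True)

Reading through a file-like object this way issues shorter block-sized GETs, which tends to avoid holding a single long-lived SSL read open for the entire transfer.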