Code Sample, a copy-pastable example if possible

import pandas as pd

data = '/Users/davidleifer/Desktop/Geog500/thesis/data/merged-file.json'
df = pd.read_json(data, lines=True)

Problem description

The JSON file contains Twitter data scraped using their API. I've limited the files to 10,000 tweets per file. I clean the files using this process:

1) Merge the files in the directory using: cat * > merged-file.json
2) Remove blank lines in Sublime Text using Find and Replace (regex): ^\n

Here are two example Tweets (one tweet per line):

{"created_at":"Thu Nov 02 08:08:01 +0000 2017","id":925997914136002562,"id_str":"925997914136002562","text":"#RussianGate #FollowTheFacts #Resist #FakePresident #GOP #War #Vote #ClimateChange #Peace #Animals #Women https:\/\/t.co\/xe7AEdod1Y","display_text_range":[0,105],"source":"\u003ca href=\"http:\/\/twitter.com\" rel=\"nofollow\"\u003eTwitter Web Client\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":760436942,"id_str":"760436942","name":"Athoughtz","screen_name":"athoughtz","location":"United States","url":null,"description":"#RussianGate #FollowTheFacts #Resist #FakePresident #GOP #War #Vote #ClimateChange #Peace #Animals #Women","translator_type":"none","protected":false,"verified":false,"followers_count":5063,"friends_count":5064,"listed_count":142,"favourites_count":659,"statuses_count":62057,"created_at":"Thu Aug 16 00:11:12 +0000 2012","utc_offset":-25200,"time_zone":"Arizona","geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/378800000835488491\/565d1bd43c8b0a615b8a39887e52ef2c_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/378800000835488491\/565d1bd43c8b0a615b8a39887e52ef2c_normal.jpeg","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[{"text":"RussianGate","indices":[0,12]},{"text":"FollowTheFacts","indices":[13,28]},{"text":"Resist","indices":[29,36]},{"text":"FakePresident","indices":[37,51]},{"text":"GOP","indices":[52,56]},{"text":"War","indices":[57,61]},{"text":"Vote","indices":[62,67]},{"text":"ClimateChange","indices":[68,82]},{"text":"Peace","indices":[83,89]},{"text":"Animals","indices":[90,98]},{"text":"Women","indices":[99,105]}],"urls":[],"user_mentions":[],"symbols":[],"media":[{"id":925997885778378752,"id_str":"925997885778378752","indices":[106,129],"media_url":"http:\/\/pbs.twimg.com\/media\/DNnOK8SVQAAUS6Z.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/DNnOK8SVQAAUS6Z.jpg","url":"https:\/\/t.co\/xe7AEdod1Y","display_url":"pic.twitter.com\/xe7AEdod1Y","expanded_url":"https:\/\/twitter.com\/athoughtz\/status\/925997914136002562\/photo\/1","type":"photo","sizes":{"medium":{"w":600,"h":585,"resize":"fit"},"small":{"w":600,"h":585,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":600,"h":585,"resize":"fit"}}}]},"extended_entities":{"media":[{"id":925997885778378752,"id_str":"925997885778378752","indices":[106,129],"media_url":"http:\/\/pbs.twimg.com\/media\/DNnOK8SVQAAUS6Z.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/DNnOK8SVQAAUS6Z.jpg","url":"https:\/\/t.co\/xe7AEdod1Y","display_url":"pic.twitter.com\/xe7AEdod1Y","expanded_url":"https:\/\/twitter.com\/athoughtz\/status\/92599791413600256
2\/photo\/1","type":"photo","sizes":{"medium":{"w":600,"h":585,"resize":"fit"},"small":{"w":600,"h":585,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":600,"h":585,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"und","timestamp_ms":"1509610081596"} {"created_at":"Thu Nov 02 08:08:02 +0000 2017","id":925997918795866113,"id_str":"925997918795866113","text":"RT @CGTNOfficial: Survey released on Chinese public awareness of #climatechange https:\/\/t.co\/q92jAnobmd","source":"\u003ca href=\"http:\/\/nosudo.co\" rel=\"nofollow\"\u003eQxNews-python\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1664059166,"id_str":"1664059166","name":"Question News","screen_name":"QxNews","location":"USA","url":null,"description":"Interrogare Semper | News bot\/humans via retweets | 1 min per retweet","translator_type":"none","protected":false,"verified":false,"followers_count":3254,"friends_count":271,"listed_count":2786,"favourites_count":38,"statuses_count":1018592,"created_at":"Mon Aug 12 03:35:37 +0000 2013","utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/pbs.twimg.com\/profile_background_images\/514662332492816384\/TuhAkn7d.jpeg","profile_background_image_url_https":"https:\/\/pbs.twimg.com\/profile_background_images\/514662332492816384\/TuhAkn7d.jpeg","profile_background_tile":false,"profile_link_color":"000000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/597288578092240896\/ePlmSYCH_normal.png","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/597288578092240896\/ePlmSYCH_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1664059166\/1484679111","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Nov 02 07:55:00 +0000 2017","id":925994638019825664,"id_str":"925994638019825664","text":"Survey released on Chinese public awareness of #climatechange https:\/\/t.co\/q92jAnobmd","source":"\u003ca href=\"https:\/\/about.twitter.com\/products\/tweetdeck\" rel=\"nofollow\"\u003eTweetDeck\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1115874631,"id_str":"1115874631","name":"CGTN","screen_name":"CGTNOfficial","location":"Beijing, China","url":"http:\/\/www.CGTN.com","description":"China Global Television Network, or CGTN, is a multi-language, multi-platform media grouping.","translator_type":"none","protected":false,"verified":true,"followers_count":4828619,"friends_count":53,"listed_count":4517,"favourites_count":32,"statuses_count":39079,"created_at":"Thu Jan 24 03:18:59 +0000 
2013","utc_offset":28800,"time_zone":"Beijing","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/pbs.twimg.com\/profile_background_images\/378800000169084583\/SqpyvnvQ.jpeg","profile_background_image_url_https":"https:\/\/pbs.twimg.com\/profile_background_images\/378800000169084583\/SqpyvnvQ.jpeg","profile_background_tile":true,"profile_link_color":"009999","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/815049165508112384\/wJA8jWZh_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/815049165508112384\/wJA8jWZh_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1115874631\/1483157766","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"quote_count":0,"reply_count":0,"retweet_count":10,"favorite_count":25,"entities":{"hashtags":[{"text":"climatechange","indices":[47,61]}],"urls":[{"url":"https:\/\/t.co\/q92jAnobmd","expanded_url":"https:\/\/news.cgtn.com\/news\/794d7a4e33597a6333566d54\/share_p.html","display_url":"news.cgtn.com\/news\/794d7a4e3\u2026","indices":[62,85]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[{"text":"climatechange","indices":[65,79]}],"urls":[{"url":"https:\/\/t.co\/q92jAnobmd","expanded_url":"https:\/\/news.cgtn.com\/news\/794d7a4e33597a6333566d54\/share_p.html","display_url":"news.cgtn.com\/news\/794d7a4e3\u2026","indices":[80,103]}],"user_mentions":[{"screen_name":"CGTNOfficial","name":"CGTN","id":1115874631,"id_str":"1115874631","indices":[3,16]}],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1509610082707"}

I get this error:


OSError                                   Traceback (most recent call last)
<ipython-input-...> in <module>()
----> 1 df = pd.read_json(data, lines=True)

/Users/davidleifer/anaconda/lib/python3.5/site-packages/pandas/io/json.py in read_json(path_or_buf, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit, encoding, lines)
    214     if exists:
    215         with _get_handle(filepath_or_buffer, 'r', encoding=encoding) as fh:
--> 216             json = fh.read()
    217     else:
    218         json = filepath_or_buffer

OSError: [Errno 22] Invalid argument

Expected Output

Loading the JSON into a pandas dataframe.

Output of pd.show_versions()

INSTALLED VERSIONS

commit: None
python: 3.5.2.final.0
python-bits: 64
OS: Darwin
OS-release: 16.7.0
machine: x86_64
processor: i386
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
LOCALE: en_US.UTF-8

pandas: 0.19.0
nose: 1.3.7
pip: 9.0.1
setuptools: 36.2.7
Cython: 0.24
numpy: 1.13.2
scipy: 0.19.1
statsmodels: 0.6.1
xarray: None
IPython: 4.2.0
sphinx: 1.4.1
patsy: 0.4.1
dateutil: 2.5.3
pytz: 2016.4
blosc: None
bottleneck: 1.1.0
tables: 3.3.0
numexpr: 2.6.2
matplotlib: 1.5.1
openpyxl: 2.3.2
xlrd: 1.0.0
xlwt: 1.1.2
xlsxwriter: 0.9.2
lxml: 3.6.0
bs4: None
html5lib: 0.999999999
httplib2: 0.9.2
apiclient: 1.5.1
sqlalchemy: 1.0.13
pymysql: None
psycopg2: 2.6.2 (dt dec pq3 ext lo64)
jinja2: 2.8
boto: 2.48.0
pandas_datareader: None

Comment From: gfyoung

@DavidLeifer: Thanks for reporting this! A couple of things:

  • Your pandas version is very old. Try upgrading first to 0.21.0 and see whether that solves your issue.
  • If it does not, could you upload the JSON file instead of copy/pasting the contents into the issue?

Comment From: ghost

Okay, I've attached a zip of the JSON file. I have a directory of 41 of these JSON files that I merge together with the aforementioned command (2.6 GB altogether). Loading some of these individual JSON files works with pd.read_json(), but not when they are all merged together.

How do you upgrade pandas?

energy20171102T030802.txt.zip

Comment From: gfyoung

If you're using pip, run pip install --upgrade pandas; if you're using conda, run conda upgrade pandas.

Comment From: ghost

I have upgraded pandas and tried again. This is the error now:


OSError                                   Traceback (most recent call last)
<ipython-input-...> in <module>()
----> 1 df = pd.read_json(data, lines=True)

/Users/davidleifer/anaconda/lib/python3.5/site-packages/pandas/io/json/json.py in read_json(path_or_buf, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit, encoding, lines, chunksize, compression)
    358         keep_default_dates=keep_default_dates, numpy=numpy,
    359         precise_float=precise_float, date_unit=date_unit, encoding=encoding,
--> 360         lines=lines, chunksize=chunksize, compression=compression,
    361     )
    362

/Users/davidleifer/anaconda/lib/python3.5/site-packages/pandas/io/json/json.py in __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit, encoding, lines, chunksize, compression)
    402
    403         data = self._get_data_from_filepath(filepath_or_buffer)
--> 404         self.data = self._preprocess_data(data)
    405
    406     def _preprocess_data(self, data):

/Users/davidleifer/anaconda/lib/python3.5/site-packages/pandas/io/json/json.py in _preprocess_data(self, data)
    413         """
    414         if hasattr(data, 'read') and not self.chunksize:
--> 415             data = data.read()
    416         if not hasattr(data, 'read') and self.chunksize:
    417             data = StringIO(data)

OSError: [Errno 22] Invalid argument

Comment From: gfyoung

You said earlier that some of your files were not readable (before the merge); have you checked that they're not malformed? That's one reason why you would get this error.

Comment From: ghost

I will try loading each file to see if any are malformed.

Do you know of anything that could validate a large file? All the online validators I could find have file size limits of less than 2 MB.
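In the meantime, a line-by-line pass with Python's json module should sidestep any size limit, since the file never has to be read in one piece. A rough sketch (the path is just an example):

import json

# Validate a JSON-lines file one record at a time so file size never matters.
with open('merged-file.json', 'r') as fh:
    for lineno, line in enumerate(fh, start=1):
        line = line.strip()
        if not line:
            continue  # skip blank lines
        try:
            json.loads(line)
        except ValueError as err:  # json.JSONDecodeError subclasses ValueError
            print('Bad record on line {}: {}'.format(lineno, err))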

Comment From: ghost

Prior to merging, each individual file loaded just fine with pd.read_json(). After merging, I think the last tweet of one file gets combined with the first tweet of the next. With no line break after the last tweet, this is probably what is throwing the error.

Any ideas how to combine 41 files with a line break after the last line of each? I would rather not copy/paste 41 files together.
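One idea, sketched in Python (the glob pattern and output name are just examples), is to force a newline after each file during the merge:

import glob

# Concatenate all per-day files, guaranteeing a newline after each one so the
# last tweet of one file never runs into the first tweet of the next.
with open('merged-file.json', 'w') as out:
    for path in sorted(glob.glob('*.txt')):
        with open(path) as src:
            data = src.read()
        out.write(data)
        if not data.endswith('\n'):
            out.write('\n')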

Comment From: ghost

I just tried to merge them with:

for f in *.txt; do (cat "${f}"; echo) >> finalfile.txt; done

This left a blank line after each file, so I removed all of the blank lines afterwards.

Still got the same error. Individually they all load, but merged they do not.

Comment From: gfyoung

@DavidLeifer: Try doing it with two files first instead of all 41. That will be a lot easier to handle and debug than doing all 41 at once.

Comment From: ghost

@gfyoung Went through all 41 and found 3 files that didn't finish collecting all the way, resulting in a split tweet. It seems like the API just shut off mid-stream.

Comment From: gfyoung

It seems like the API just shut off mid-stream.

That's odd. Not sure why the Twitter API would do that. Hopefully it didn't think you were spamming it :smile:

Comment From: gfyoung

The issue arose due to malformed JSON, not a pandas bug. @DavidLeifer: I will close this for now, but if the issue reappears on correctly formed JSON, let us know.

Comment From: ghost

thank you @gfyoung

Comment From: ozak

I got the same issue opening a 2.6GB JSON file. Strangely, the issue does not occur on Linux, only on OSX, so I am not sure whether this is due to differences in the OS. I have tried using chunks to load the data on OSX, but that has caused other issues, including memory leaks that have gone above 50GB.

Comment From: ozak

On OSX I was able to load the data by doing

import pandas as pd

max_records = int(1e5)  # chunksize must be an integer
reader = pd.read_json(file, lines=True, chunksize=max_records)  # file holds the path to the JSON file
filtered_data = pd.DataFrame()  # initialize the dataframe
try:
    for df_chunk in reader:
        filtered_data = pd.concat([filtered_data, df_chunk])
except ValueError:
    print('\nSome messages in the file cannot be parsed')

as suggested here. Interestingly, I never got an error message in this case.
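(Side note: calling pd.concat inside the loop re-copies the accumulated dataframe on every chunk, which may be part of the memory blow-up mentioned above. Collecting the chunks in a list and concatenating once is usually cheaper. A sketch along the same lines, not tested on the original data:)

import pandas as pd

chunks = []
try:
    for df_chunk in pd.read_json(file, lines=True, chunksize=int(1e5)):  # file as above
        chunks.append(df_chunk)
except ValueError:
    print('\nSome messages in the file cannot be parsed')
# One concat at the end instead of one per chunk
filtered_data = pd.concat(chunks, ignore_index=True)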

Comment From: devssh

I checked the JSON; it is a 2GB file with 300,000 rows. I loaded the first 200,000 rows into a dataframe and the last 100,000 rows into another dataframe successfully. It seems the OSError is not due to malformed JSON, but due to the size of the file.


OSError                                   Traceback (most recent call last)
<ipython-input-150-c616a01fdf9c> in <module>()
----> 1 df = pd.read_json('data2018-06-01.json', lines=True)
      2 display(df.head(10))
      3 df.shape

~/anaconda3/lib/python3.6/site-packages/pandas/io/json/json.py in read_json(path_or_buf, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit, encoding, lines)
    333             fh, handles = _get_handle(filepath_or_buffer, 'r',
    334                                       encoding=encoding)
--> 335             json = fh.read()
    336             fh.close()
    337         else:

OSError: [Errno 22] Invalid argument

Comment From: bnopacheco

@gfyoung I am processing a 5.4GB JSON file. I am getting the same OSError on Mac.

df = pd.read_json(inputFile)

---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
<timed exec> in <module>()

/usr/local/lib/python3.6/site-packages/pandas/io/json/json.py in read_json(path_or_buf, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit, encoding, lines, chunksize, compression)
    414         keep_default_dates=keep_default_dates, numpy=numpy,
    415         precise_float=precise_float, date_unit=date_unit, encoding=encoding,
--> 416         lines=lines, chunksize=chunksize, compression=compression,
    417     )
    418 

/usr/local/lib/python3.6/site-packages/pandas/io/json/json.py in __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit, encoding, lines, chunksize, compression)
    464 
    465         data = self._get_data_from_filepath(filepath_or_buffer)
--> 466         self.data = self._preprocess_data(data)
    467 
    468     def _preprocess_data(self, data):

/usr/local/lib/python3.6/site-packages/pandas/io/json/json.py in _preprocess_data(self, data)
    475         """
    476         if hasattr(data, 'read') and not self.chunksize:
--> 477             data = data.read()
    478         if not hasattr(data, 'read') and self.chunksize:
    479             data = StringIO(data)

OSError: [Errno 22] Invalid argument

Comment From: fercook

Hit the same bug with a valid 13GB jsonlines file on macOS and pandas 0.23.0. Please reopen the issue.

Comment From: mariskaas

Same bug with pd.to_json from a CSV file. The CSV file is only 700MB; I can in fact convert it to JSON the long way, but that gives a slightly different format than I would like. Pandas version is 0.23.4.

Comment From: zc4242

I have the same issue, so I have to load the data into MongoDB instead.

Comment From: gfyoung

Hello everyone, thanks for all of the follow-up comments! Seems like there is some uncertainty as to whether there's a bug or not.

It seems that you all are running into issues reading large JSON files. Thus, I would ask that you all do the following if you can:

  • Confirm that your JSON files are indeed valid.
  • Does reading it in chunks (see above) suffice as a workaround? If so, can you share the code that you used to do so?
  • Unfortunately, giant files make it difficult to reproduce (and test), so if you can shrink your datasets as much as possible while still triggering the error, that will make it easier for us to evaluate whether we should re-open.

Comment From: ozak

I have tried the solution above on multiple files and it works OK. I think the problem arises when the file is around 2GB or larger. I haven't tried other solutions like dask to see whether they solve the issue. This has only happened on OSX; on Linux the data loads directly without any issues. I've only seen this issue opening JSON, not other formats like CSV or dta, even with data over 20GB.

Comment From: PowerToThePeople111

Got the same problem here. Dask gives the same error message, by the way, because it calls pandas in the end...

---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
<ipython-input-3-faaf3572c9c3> in <module>
      1 import dask.dataframe as dd
      2 
----> 3 dd.read_json('XXXXXXXXXXXXXX/Downloads/raw-data.json')

/anaconda3/envs/FunkyStuff/lib/python3.6/site-packages/dask/dataframe/io/json.py in read_json(url_path, orient, lines, storage_options, blocksize, sample, encoding, errors, compression, **kwargs)
    161         parts = [dask.delayed(read_json_file)(f, orient, lines, kwargs)
    162                  for f in files]
--> 163     return dd.from_delayed(parts)
    164 
    165 


...



/anaconda3/envs/FunkyStuff/lib/python3.6/site-packages/dask/dataframe/io/json.py in read_json_file(f, orient, lines, kwargs)
    176 def read_json_file(f, orient, lines, kwargs):
    177     with f as f:
--> 178         return pd.read_json(f, orient=orient, lines=lines, **kwargs)


...


/anaconda3/envs/FunkyStuff/lib/python3.6/site-packages/pandas/io/json/json.py in _preprocess_data(self, data)
    475         """
    476         if hasattr(data, 'read') and not self.chunksize:
--> 477             data = data.read()
    478         if not hasattr(data, 'read') and self.chunksize:
    479             data = StringIO(data)

OSError: [Errno 22] Invalid argument

Comment From: stantyan

Got the same problem on OSX with a 4.7GB JSON file. What is the proper way to solve this issue?

Comment From: ozak

See the solution I posted above. So far it seems to work OK.

Comment From: jacopofar

Same problem here on macOS; pretty sure it is this issue. As a workaround, one can read the file into a string in chunks as described here: the problem arises when reading or writing more than 2GB in a single step, but doing it in chunks is fine.
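To make that workaround concrete, here is a minimal sketch, assuming a JSON-lines file that fits in memory; read_big_json_lines and the 1 GiB chunk size are illustrative choices, not a pandas API:

import io
import pandas as pd

CHUNK = 1 << 30  # 1 GiB per read; single reads of 2GB or more are what fail on macOS

def read_big_json_lines(path):
    # Read the file ourselves in sub-2GB pieces, then hand pandas an
    # in-memory buffer so it never issues one giant fh.read().
    parts = []
    with open(path, 'r') as fh:
        while True:
            piece = fh.read(CHUNK)
            if not piece:
                break
            parts.append(piece)
    return pd.read_json(io.StringIO(''.join(parts)), lines=True)

df = read_big_json_lines('merged-file.json')  # example path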