An example where import_flatfile_into_database
does a better job than pandas at converting a flat file into a database table.
import random
import pandas

text = ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"]
# 10000 random (name, number) records
data = [{"name": text[random.randint(0, 9)], "number": random.randint(0, 99)}
        for i in range(0, 10000)]
df = pandas.DataFrame(data)
df.head(n=3)
 | name | number
---|---|---
0 | one | 75
1 | two | 73
2 | three | 77
df.to_csv("flatfile.txt", sep="\t", encoding="utf8", header=True, index=False)
dfr = pandas.read_csv("flatfile.txt", sep="\t", encoding="utf8")
dfr.head(n=3)
 | name | number
---|---|---
0 | one | 75
1 | two | 73
2 | three | 77
Let's now assume the data contains an extra tabulation inside a value.
datatab = [ {"name": " one\ttab", "number":100 } ] + data
df = pandas.DataFrame(datatab)
df.head(n=3)
 | name | number
---|---|---
0 | one\ttab | 100
1 | one | 75
2 | two | 73
df.to_csv("flatfile_tab.txt", sep="\t", encoding="utf8", header=True, index=False)
dfr = pandas.read_csv("flatfile_tab.txt", sep="\t", encoding="utf8")
dfr.head(n=3)
 | name | number
---|---|---
0 | one\ttab | 100
1 | one | 75
2 | two | 73
It works well because we used pandas both to save the dataframe and to restore it. In the file flatfile_tab.txt, the value containing the extra tab is surrounded by double quotes, and pandas interprets those quotes as text delimiters when it reads the file back. However, most of the time the flat file is produced by another tool and the quotes are not present.
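This quoting behaviour can be checked in isolation; a small self-contained sketch, independent of the files generated above:

```python
import io
import pandas

# A value that contains the separator itself (a tab).
df = pandas.DataFrame([{"name": " one\ttab", "number": 100}])

buf = io.StringIO()
df.to_csv(buf, sep="\t", index=False)
raw = buf.getvalue()
# to_csv protected the field with double quotes:
print(repr(raw))

# read_csv honours the quotes, so the round trip is lossless:
back = pandas.read_csv(io.StringIO(raw), sep="\t")
print(back)
```

When the file is written by another tool that does not add such quotes, this protection disappears; the next step simulates that by stripping them.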
# remove the quotes to mimic a file produced without text delimiters
with open("flatfile_tab.txt", "r", encoding="utf8") as f:
    content = f.read()
content = content.replace('"', '')
with open("flatfile_tab2.txt", "w", encoding="utf8") as f:
    f.write(content)
dfr = pandas.read_csv("flatfile_tab2.txt", sep="\t", encoding="utf8")
dfr.head(n=3)
 | name | number
---|---|---
one | tab | 100.0
one | 75 | NaN
two | 73 | NaN
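This misaligned result can be reproduced without the generated file; a minimal sketch, assuming nothing but pandas:

```python
import io
import pandas

# Same shape as the unquoted file: the very first data row carries an
# extra tab, hence three fields against two column names.
raw = "name\tnumber\none\ttab\t100\ntwo\t73\n"

dfr = pandas.read_csv(io.StringIO(raw), sep="\t")
# pandas silently treats the first field of each row as an index
# instead of raising an error:
print(dfr)
```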
It failed! The data is no longer aligned, and pandas did not raise any exception. If we move the line with the extra tab to the second position, we get:
datatab = data[:1] + [ {"name": " one\ttab", "number":100 } ] + data[1:]
df = pandas.DataFrame(datatab)
df.to_csv("flatfile_tab_pos2.txt", sep="\t", encoding="utf8", header=True, index=False)
# again remove the quotes from the saved file
with open("flatfile_tab_pos2.txt", "r", encoding="utf8") as f:
    content = f.read()
content = content.replace('"', '')
with open("flatfile_tab_pos2.txt", "w", encoding="utf8") as f:
    f.write(content)
dfr = pandas.read_csv("flatfile_tab_pos2.txt", sep="\t", encoding="utf8")
dfr.head(n=3)
---------------------------------------------------------------------------
ParserError                               Traceback (most recent call last)
<ipython-input-8-473d60fe4f7d> in <module>()
----> 1 dfr = pandas.read_csv("flatfile_tab_pos2.txt", sep="\t", encoding="utf8")
      2 dfr.head(n=3)
c:\Python36_x64\lib\site-packages\pandas\io\parsers.py in parser_f(...)
c:\Python36_x64\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
c:\Python36_x64\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.read (pandas\_libs\parsers.c:10862)()
pandas\_libs\parsers.pyx in pandas._libs.parsers.raise_parser_error (pandas\_libs\parsers.c:28765)()
ParserError: Error tokenizing data. C error: Expected 2 fields in line 3, saw 3
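The error can also be reproduced without the generated files; a minimal sketch of the same situation:

```python
import io
import pandas

# Minimal reproduction: the extra tab sits on the third line, after a
# first data row that fixed the expected field count at two.
raw = "name\tnumber\none\t75\n one\ttab\t100\ntwo\t73\n"

err = None
try:
    pandas.read_csv(io.StringIO(raw), sep="\t")
except pandas.errors.ParserError as e:
    err = e
    # "Error tokenizing data. C error: Expected 2 fields in line 3, saw 3"
    print(err)
```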
As suggested in the Stack Overflow question Python Pandas Error tokenizing data, we could add the parameter error_bad_lines=False
or skiprows=N,
but we would still lose the bad lines. So we use the function import_flatfile_into_database instead.
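As a side note, error_bad_lines has since been deprecated; recent pandas versions (1.3 and later) expose the same behaviour through on_bad_lines. A quick sketch:

```python
import io
import pandas

raw = "name\tnumber\none\t75\n one\ttab\t100\ntwo\t73\n"

# on_bad_lines="skip" drops the misaligned row instead of raising.
dfr = pandas.read_csv(io.StringIO(raw), sep="\t", on_bad_lines="skip")
print(dfr)
```

The misaligned row is dropped silently, and it is precisely the row we wanted to keep, hence the import into a database below.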
from pyensae.sql import import_flatfile_into_database
import_flatfile_into_database("flatfile_tab_pos2.db3", "flatfile_tab_pos2.txt")
TextFile: opening file flatfile_tab_pos2.txt
TextFile.guess_columns: processing file flatfile_tab_pos2.txt
TextFile: opening file flatfile_tab_pos2.txt
TextFile.guess_columns: using 101 lines
TextFile: closing file flatfile_tab_pos2.txt
TextFile.guess_columns: sep '\t' nb cols 2 bestnb 100 more {('\t', 1): 100, ('\t', 2): 1}
TextFile.guess_columns: header True columns {0: ('name', <class 'str'>), 1: ('number', <class 'int'>)}
compiling ^(?P<name>.*)\t(?P<number>([-]?[1-9][0-9]*?)|(0?))$
TextFile.guess_columns: regex ^(?P<name>.*)\t(?P<number>([-]?[1-9][0-9]*?)|(0?))$
TextFile.guess_columns: header True columns {0: ('name', (<class 'str'>, 10)), 1: ('number', <class 'int'>)}
[_guess_columns] sep=['\t']
TextFile: closing file flatfile_tab_pos2.txt
[_guess_columns] columns_name=None
guess with 1001 lines
count_types {0: {<class 'str'>: 1000}, 1: {<class 'int'>: 999, <class 'str'>: 1}}
columns {0: ('name', <class 'str'>), 1: ('number', <class 'int'>)}
guess {0: ('name', (<class 'str'>, 10)), 1: ('number', <class 'int'>)}
SQL 'CREATE TABLE flatfile_tab_pos2(name TEXT,' ' number INTEGER);'
column_has_space False ['name', 'number'] changes {}
TextFileColumns (2): regex: {0: ('name', (<class 'str'>, 10)), 1: ('number', <class 'int'>)}
TextFile.guess_columns: processing file flatfile_tab_pos2.txt
TextFile: opening file flatfile_tab_pos2.txt
TextFile.guess_columns: using 101 lines
TextFile: closing file flatfile_tab_pos2.txt
TextFile.guess_columns: sep '\t' nb cols 2 bestnb 100 more {('\t', 1): 100, ('\t', 2): 1}
TextFile.guess_columns: header True columns {0: ('name', <class 'str'>), 1: ('number', <class 'int'>)}
compiling ^(?P<name>.*)\t(?P<number>([-]?[1-9][0-9]*?)|(0?))$
TextFile.guess_columns: regex ^(?P<name>.*)\t(?P<number>([-]?[1-9][0-9]*?)|(0?))$
TextFile.guess_columns: header True columns {0: ('name', (<class 'str'>, 10)), 1: ('number', <class 'int'>)}
TextFile: opening file flatfile_tab_pos2.txt
^(?P<name>.*)\t(?P<number>([-]?[1-9][0-9]*?)|(0?))$
error regex 0
unable to interpret line 10002 : ''
TextFile: closing file flatfile_tab_pos2.txt
10001 lines imported
'flatfile_tab_pos2'
We check that the line with the extra tab made it into the database:
from pyensae.sql import Database
db = Database("flatfile_tab_pos2.db3")
db.connect()
df = db.to_df("SELECT * FROM flatfile_tab_pos2")
db.close()
df.head()
SQL 'SELECT * FROM flatfile_tab_pos2'
 | name | number
---|---|---
0 | one | 75
1 | one\ttab | 100
2 | two | 73
3 | three | 77
4 | six | 38