Is there a tool that shows a distribution of lines of code per file of a folder? - visualization

I want to know how big the files in my repository are in terms of lines of code, to gauge the 'health' of the repository.
To answer this, I would like to see a distribution (visualised or not) of the number of files per line-count range (the range size can be 1):
#lines of code    #files
1-10              1
11-20             23
etc...
(A histogram of this would be nice)
Is there a quick way to get this, for example with cloc or any other (command-line) tool?

A combination of cloc and Pandas can handle this. First, capture the line counts with cloc to a CSV file using the --by-file and --csv switches, for example:
cloc --by-file --csv --out data.csv curl-7.80.0.tar.bz2
then use the Python program below to aggregate and bin the data by folders:
./aggregate_by_folder.py data.csv
The code for aggregate_by_folder.py is
#!/usr/bin/env python
import sys
import os.path

import pandas as pd


def add_folder(df):
    """
    Return a Pandas dataframe with an additional 'folder' column
    containing each file's parent directory.
    """
    # Drop the cloc version column; its header contains this URL.
    header = 'github.com/AlDanial/cloc'
    df = df.drop(df.columns[df.columns.str.contains(header)], axis=1)
    df['folder'] = df['filename'].dropna().apply(os.path.dirname)
    return df


def bin_by_folder(df):
    # Bin each folder's per-file line counts into 50-line buckets.
    bins = list(range(0, 1000, 50))
    return df.groupby('folder')['code'].value_counts(bins=bins).sort_index()


def file_count_by_folder(df):
    df_files = pd.pivot_table(df, index=['folder'], aggfunc='count')
    file_counts = df_files.rename(columns={'blank': 'file count'})
    return file_counts[['file count']]


def main():
    if len(sys.argv) != 2:
        print(f"Usage: {sys.argv[0]} data.csv")
        print("  where the .csv file is created with")
        print("  cloc --by-file --csv --out data.csv my_code_base")
        raise SystemExit
    pd.set_option('display.max_rows', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', None)  # -1 is deprecated in recent pandas
    df = add_folder(pd.read_csv(sys.argv[1]))
    print(pd.pivot_table(df, index=['folder'], aggfunc='sum'))
    print('-' * 50)
    print(file_count_by_folder(df))
    print('-' * 50)
    print(bin_by_folder(df))


if __name__ == "__main__":
    main()
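If you want the actual histogram the question asks for rather than the binned table, here is a minimal sketch working from the same data.csv (the code and filename columns come from cloc's --by-file output; the 50-line bin width and the 1000-line cap are arbitrary choices, so files above 1000 lines fall outside these bins and you may want to widen the range):
#!/usr/bin/env python
# Plot a histogram of lines of code per file from cloc's --by-file CSV.
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('data.csv')
df = df[df['filename'].notna()]  # drop rows without a filename (e.g. cloc's summary row)
df['code'].plot.hist(bins=list(range(0, 1050, 50)))
plt.xlabel('lines of code per file')
plt.ylabel('number of files')
plt.savefig('loc_histogram.png')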

Related

How to efficiently rename many files or partially select the names of these files during import?

How can I rename the files efficiently by the number in the name (see picture)? I did not succeed with Windows PowerToys, and I don't want to click each file and rename it to the number (e.g. 290).
Or how can I read the files in this order and define a name? If I try it with the script below, the following error occurs:
ValueError: invalid literal for int() with base 10: '211001_164357_P_Scripted_Powermeasurement_Wavelength_automatic_Powermeter1_0'
Or how can I select only the numbers (290 to 230 - see picture) within the name when reading?
Script:
# import libraries
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

data_location = r'C:\Users\...\Characterization_OPO\Data'
data_folder = Path(data_location)
data = {}
allist = list(data_folder.glob('*'))
for i, file in enumerate(allist):
    file = str(file)
    file_name = file.split('\\')[-1]
    wavelength = int(file_name.split('.')[0])
    tmp = pd.read_csv(file, skiprows=20, skipfooter=59, index_col="PixelNo")
    data[f'{wavelength} nm'] = tmp
    # data.plot(x='Wavelength', y='CCD_1', label=f"{wavelength} nm")
Picture: (omitted)
I removed all the words with Windows PowerRename and then took the last three digits:
for i, file in enumerate(allist):
    file = str(file)
    file_name = file.split('\\')[-1]
    wavelength = int(file_name.split('.')[0])
    tmp = pd.read_csv(file, skiprows=26, skipfooter=5)
    data[f'{wavelength % 1000} nm'] = tmp
    # data.plot(x='Wavelength', y='CCD_1', label=f"{wavelength} nm")
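Alternatively, you can skip the renaming step entirely and pull the number out of the name with a regular expression. A minimal sketch, assuming the number you want is the trailing digits of the file name (as after your PowerRename step; adjust the pattern if your raw names differ). Note skipfooter needs engine='python' in pandas:
import re
from pathlib import Path

import pandas as pd

data_folder = Path(r'C:\Users\...\Characterization_OPO\Data')  # same elided path as above
data = {}
for file in data_folder.glob('*'):
    match = re.search(r'(\d+)$', file.stem)  # trailing digits of the name, e.g. '290'
    if match is None:
        continue  # skip files whose names do not end in a number
    wavelength = int(match.group(1)) % 1000  # keep the last three digits, as above
    data[f'{wavelength} nm'] = pd.read_csv(file, skiprows=26, skipfooter=5, engine='python')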

How to read data from a CSV file in Locust performance test scripts?

I am trying to read the data from a CSV file that contains 1 row and 5 columns using the following code:
def __init__(self):
    super(data, self).__init__()
    global data
    if (data == None):
        with open('var.csv', 'r') as l:
            reader = csv.reader(l)
            data = list(reader)

def on_start(self):
    if len(data) > 0:
        self.my_value = data.pop()
My output is ('sample'), and I want it to be sample.
Change the last line from self.my_value = data.pop() to self.my_value = data.pop()[0]. csv.reader yields each row as a list of columns, so pop() returns the whole row; [0] takes its first field.
But you could also use locust plugins csv reader: https://github.com/SvenskaSpel/locust-plugins/blob/master/examples/csvreader_ex.py
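For context, here is a minimal self-contained sketch of the same idea in a Locust HttpUser class (the host and endpoint are placeholders, and the CSV is read once when the locustfile is imported):
import csv

from locust import HttpUser, task

# Read all rows once at import time; each row is a list of columns.
with open('var.csv', 'r') as f:
    data = list(csv.reader(f))


class CsvUser(HttpUser):
    host = 'http://localhost:8080'  # placeholder target

    def on_start(self):
        # Each simulated user takes one row and keeps its first column.
        self.my_value = data.pop()[0] if data else None

    @task
    def use_value(self):
        self.client.get('/lookup', params={'value': self.my_value})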

How to read a Kaggle zip file dataset in Databricks

I want to read a zip file dataset from Kaggle, but I am unable to:
import urllib
urllib.request.urlretrieve("https://www.kaggle.com/himanshupoddar/zomato-bangalore-restaurants/downloads/zomato-bangalore-restaurants.zip", "/tmp/zomato-bangalore-restaurants.zip")
then I run a shell script to extract the file:
%sh
unzip /tmp/zomato-bangalore-restaurants.zip
tail -n +2 zomato-bangalore-restaurants.csv > temp.csv
rm zomato-bangalore-restaurants.csv
Then I got an error:
Archive: /tmp/zomato-bangalore-restaurants.zip
End-of-central-directory signature not found. Either this file is not
a zipfile, or it constitutes one disk of a multi-part archive. In the
latter case the central directory and zipfile comment will be found on
the last disk(s) of this archive.
unzip: cannot find zipfile directory in one of /tmp/zomato-bangalore-restaurants.zip or
/tmp/zomato-bangalore-restaurants.zip.zip, and cannot find /tmp/zomato-bangalore-restaurants.zip.ZIP, period.
tail: cannot open 'zomato-bangalore-restaurants.csv' for reading: No such file or directory
rm: cannot remove 'zomato-bangalore-restaurants.csv': No such file or directory
Note: The download from Kaggle is blocked because you are not logged in; the URL returns an HTML login page instead of the zip file, which is why unzip cannot find a zip archive.
Here is the script to download all the competition data sets.
from requests import get, post
from os import mkdir, remove
from os.path import exists
from shutil import rmtree
import zipfile


def purge_all_downloads(db_full_path):
    # Remove all the downloaded datasets.
    if exists(db_full_path): rmtree(db_full_path)


def datasets_are_available_locally(db_full_path, datasets):
    # Return True only if all the competition datasets are available locally in Databricks CE.
    if not exists(db_full_path): return False
    for df in datasets:
        # Assumes all the datasets end with the '.csv' extension.
        if not exists(db_full_path + df + '.csv'): return False
    return True


def remove_zip_files(db_full_path, datasets):
    for df in datasets:
        remove(db_full_path + df + '.csv.zip')


def unzip(db_full_path, datasets):
    for df in datasets:
        with zipfile.ZipFile(db_full_path + df + '.csv.zip', 'r') as zf:
            zf.extractall(db_full_path)
    remove_zip_files(db_full_path, datasets)


def download_datasets(competition, db_full_path, datasets, username, password):
    # Download the competition datasets if not available locally.
    if datasets_are_available_locally(db_full_path, datasets):
        print('All the competition datasets have been downloaded, extracted and are ready for you!')
        return
    purge_all_downloads(db_full_path)
    mkdir(db_full_path)
    kaggle_info = {'UserName': username, 'Password': password}
    for df in datasets:
        url = (
            'https://www.kaggle.com/account/login?ReturnUrl=' +
            '/c/' + competition + '/download/' + df + '.csv.zip'
        )
        request = post(url, data=kaggle_info, stream=True)
        # Write the response to a local file in binary mode.
        with open(db_full_path + df + '.csv.zip', 'wb') as f:
            for chunk in request.iter_content(chunk_size=512 * 1024):
                if chunk: f.write(chunk)
    # Extract the competition data.
    unzip(db_full_path, datasets)
    print('done!')
For more details, refer to "Download the competition data sets directly".
Hope this helps.
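Note that the login-form approach above is fragile; Kaggle's official Python package is usually simpler. A sketch, assuming you have installed the kaggle package and placed an API token in ~/.kaggle/kaggle.json:
from kaggle.api.kaggle_api_extended import KaggleApi

# Authenticate from ~/.kaggle/kaggle.json, then download and unzip the dataset.
api = KaggleApi()
api.authenticate()
api.dataset_download_files('himanshupoddar/zomato-bangalore-restaurants',
                           path='/tmp/zomato', unzip=True)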

Unable to parse email (.msg) in Python 3.6

I have a set of .msg files stored on the E:/ drive that I have to read and extract some information from. For that I am using the code below in Python 3.6:
from email.parser import Parser
p = Parser()
headers = p.parse(open('E:/Ratan/msg_files/Test1.msg', encoding='Latin-1'))
print('To: %s' % headers['To'])
print('From: %s' % headers['From'])
print('Subject: %s' % headers['subject'])
The output I get is as below:
To: None
From: None
Subject: None
I am not getting the actual values in the To, From and Subject fields.
Any thoughts on why it is not printing the actual values?
Please download my sample msg file from this link:
drive.google.com/file/d/1pwWWG3BgsMKwRr0WmP8GqzG3WX4GmEy6/view
Here is a demonstration of how to use some of python's standard email libraries.
You didn't show us your input file in the question, and the g-drive URL is a deadlink.
The code below looks just like yours and works fine, so I don't know what is odd about your environment, modulo some Windows 'rb' binary open nonsense, CRLFs, or the Latin1 encoding.
I threw in .upper() but it does nothing beyond showing that the API is case insensitive.
#! /usr/bin/env python3
from email.parser import Parser
from pathlib import Path
import mailbox


def extract_messages(maildir, mbox_file, k=2, verbose=False):
    # Write each message in the mbox out to its own numbered text file.
    for n, message in enumerate(mailbox.mbox(mbox_file)):
        with open(maildir / f'{n}.txt', 'w') as fout:
            fout.write(str(message))

    hdrs = 'From Date Subject In-Reply-To References Message-ID'.split()
    p = Parser()
    for i in range(min(k, n)):
        with open(maildir / f'{i}.txt') as fin:
            msg = p.parse(fin)
        print([len(msg[hdr.upper()] or '')
               for hdr in hdrs])
        for k, v in msg.items():
            print(k, v)
        print('')
        if verbose:
            print(msg.get_payload())


if __name__ == '__main__':
    # mbox archive from https://mail.python.org/pipermail/python-dev/
    maildir = Path('/tmp/py-dev/')
    maildir.mkdir(exist_ok=True)
    extract_messages(maildir, maildir / '2018-January.txt')
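One caveat: if your Test1.msg files were saved by Outlook, they are in Microsoft's binary OLE2 .msg format, not RFC 822 text, and email.parser will parse them to empty headers exactly as you observed. A sketch using the third-party extract_msg package (pip install extract-msg; the attribute names below follow that library's documented API, so double-check against your installed version):
import extract_msg

# Open an Outlook OLE2 .msg file and read the common header fields.
msg = extract_msg.Message('E:/Ratan/msg_files/Test1.msg')
print('To: %s' % msg.to)
print('From: %s' % msg.sender)
print('Subject: %s' % msg.subject)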

Printing confusion matrix to file produces illegal characters

I am classifying a set of images stored as tuples in a csv file.
The confusion matrix that I get on the terminal display is correct. But when I write that same confusion matrix to a file, it produces illegal characters (32-bit hex).
Here's the code:
from sklearn.metrics import confusion_matrix
import numpy as np
import os
import csv
from sklearn import svm
from sklearn import cross_validation
from sklearn import linear_model
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn import metrics
import cPickle

def prec(num):
    return "%0.5f" % num

outfile = open("output/linear_svm_output.txt", "a")

for dim in [20, 30, 40]:
    images = []
    labels = []
    name = str(dim) + "x" + str(dim) + ".csv"
    with open(name, 'r') as file:
        reader = csv.reader(file, delimiter=',')
        for line in file:
            labels.append(line[0])
            line = line[2:]  # Remove the label
            image = [int(pixel) for pixel in line.split(',')]
            images.append(np.array(image))

    clf = svm.LinearSVC()
    print clf

    kf = cross_validation.KFold(len(images), n_folds=10, indices=True, shuffle=True, random_state=4)
    print "\nDividing dataset using `Kfold()` -:\n\nThe training dataset has been divided into " + str(len(kf)) + " parts\n"

    for train, test in kf:
        training_images = []
        training_labels = []
        for i in train:
            training_images.append(images[i])
            training_labels.append(labels[i])
        testing_images = []
        testing_labels = []
        for i in test:
            testing_images.append(images[i])
            testing_labels.append(labels[i])
        clf.fit(training_images, training_labels)
        predicted = clf.predict(testing_images)
        print prec(clf.score(testing_images, testing_labels))
        outfile.write(prec(clf.score(testing_images, testing_labels)))
        outfile.write(str(clf))
        outfile.write(confusion_matrix(testing_labels, predicted))
        print confusion_matrix(testing_labels, predicted)
        # outfile.write(metrics.classification_report(testing_labels, predicted))

    print "\nDividing dataset using `train_test_split()` -:\n"
    training_images, testing_images, training_labels, testing_labels = cross_validation.train_test_split(images, labels, test_size=0.2, random_state=0)
    clf = clf.fit(training_images, training_labels)
    score = clf.score(testing_images, testing_labels)
    predicted = clf.predict(testing_images)
    print prec(score)
    outfile.write(str(clf))
    outfile.write(confusion_matrix(testing_labels, predicted))
    print confusion_matrix(testing_labels, predicted)
    # outfile.write(metrics.classification_report(testing_labels, predicted))
Output in the file:
302e 3939 3338 374c 696e 6561 7253 5643
2843 3d31 2e30 2c20 636c 6173 735f 7765
...
Use the following to print the matrix to file properly:
with open(filename, 'w') as f:
    f.write(np.array2string(confusion_matrix(y_test, pred), separator=', '))
This is needed because outfile.write(confusion_matrix(testing_labels, predicted)) writes the array's raw memory buffer out in binary form; that is the 32-bit hex you are seeing. If you want to write it in human-readable text, try this if you are using Python 2.x:
print >> outfile, confusion_matrix(testing_labels, predicted)
It just redirects the print output to outfile.
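Another option is a minimal sketch using numpy's own text writer (np.savetxt accepts either a filename or an already-open file handle; fmt='%d' assumes integer counts, which a confusion matrix always has):
import numpy as np
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(testing_labels, predicted)
# Append the matrix to the open text file as whitespace-separated integer columns.
np.savetxt(outfile, cm, fmt='%d')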